From 16cc345d4f84367e70e133200f7aa335c2aae8c6 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 18 Jun 2012 22:30:26 +0930
Subject: TDB2: Goodbye TDB2, Hello NTDB.

This renames everything from tdb2 to ntdb: importantly, we no longer
use the tdb_ namespace, so you can link against both ntdb and tdb if
you want to.

This also enables building of standalone ntdb by the autobuild script.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 lib/ntdb/ABI/ntdb-0.9.sigs                     |   39 +
 lib/ntdb/LICENSE                               |  165 +
 lib/ntdb/Makefile                              |   67 +
 lib/ntdb/check.c                               |  864 +++++
 lib/ntdb/configure                             |   21 +
 lib/ntdb/doc/TDB_porting.txt                   |   65 +
 lib/ntdb/doc/design-1.3.txt                    | 1049 ++++++
 lib/ntdb/doc/design.lyx                        | 2689 ++++++++++++++
 lib/ntdb/doc/design.lyx,v                      | 4679 ++++++++++++++++++++++++
 lib/ntdb/doc/design.pdf                        |  Bin 0 -> 240440 bytes
 lib/ntdb/doc/design.txt                        | 1258 +++++++
 lib/ntdb/free.c                                |  976 +++++
 lib/ntdb/hash.c                                |  894 +++++
 lib/ntdb/io.c                                  |  650 ++++
 lib/ntdb/lock.c                                |  883 +++++
 lib/ntdb/ntdb.c                                |  605 +++
 lib/ntdb/ntdb.h                                |  901 +++++
 lib/ntdb/ntdb.pc.in                            |   11 +
 lib/ntdb/open.c                                |  768 ++++
 lib/ntdb/private.h                             |  657 ++++
 lib/ntdb/pyntdb.c                              |  591 +++
 lib/ntdb/summary.c                             |  330 ++
 lib/ntdb/test/api-12-store.c                   |   57 +
 lib/ntdb/test/api-13-delete.c                  |  205 ++
 lib/ntdb/test/api-14-exists.c                  |   54 +
 lib/ntdb/test/api-16-wipe_all.c                |   46 +
 lib/ntdb/test/api-21-parse_record.c            |   67 +
 lib/ntdb/test/api-55-transaction.c             |   73 +
 lib/ntdb/test/api-80-tdb_fd.c                  |   32 +
 lib/ntdb/test/api-81-seqnum.c                  |   69 +
 lib/ntdb/test/api-82-lockattr.c                |  237 ++
 lib/ntdb/test/api-83-openhook.c                |   96 +
 lib/ntdb/test/api-91-get-stats.c               |   57 +
 lib/ntdb/test/api-92-get-set-readonly.c        |  105 +
 lib/ntdb/test/api-93-repack.c                  |   80 +
 lib/ntdb/test/api-add-remove-flags.c           |   89 +
 lib/ntdb/test/api-check-callback.c             |   86 +
 lib/ntdb/test/api-firstkey-nextkey.c           |  159 +
 lib/ntdb/test/api-fork-test.c                  |  179 +
 lib/ntdb/test/api-locktimeout.c                |  193 +
 lib/ntdb/test/api-missing-entries.c            |   44 +
 lib/ntdb/test/api-open-multiple-times.c        |   83 +
 lib/ntdb/test/api-record-expand.c              |   51 +
 lib/ntdb/test/api-simple-delete.c              |   39 +
 lib/ntdb/test/api-summary.c                    |   58 +
 lib/ntdb/test/external-agent.c                 |  252 ++
 lib/ntdb/test/external-agent.h                 |   51 +
 lib/ntdb/test/failtest_helper.c                |   96 +
 lib/ntdb/test/failtest_helper.h                |   19 +
 lib/ntdb/test/helpapi-external-agent.c         |    7 +
 lib/ntdb/test/helprun-external-agent.c         |    7 +
 lib/ntdb/test/helprun-layout.c                 |  402 ++
 lib/ntdb/test/layout.h                         |   87 +
 lib/ntdb/test/lock-tracking.c                  |  147 +
 lib/ntdb/test/lock-tracking.h                  |   25 +
 lib/ntdb/test/logging.c                        |   30 +
 lib/ntdb/test/logging.h                        |   17 +
 lib/ntdb/test/ntdb-source.h                    |   11 +
 lib/ntdb/test/run-001-encode.c                 |   41 +
 lib/ntdb/test/run-001-fls.c                    |   33 +
 lib/ntdb/test/run-01-new_database.c            |   34 +
 lib/ntdb/test/run-02-expand.c                  |   62 +
 lib/ntdb/test/run-03-coalesce.c                |  178 +
 lib/ntdb/test/run-04-basichash.c               |  260 ++
 lib/ntdb/test/run-05-readonly-open.c           |   71 +
 lib/ntdb/test/run-10-simple-store.c            |   58 +
 lib/ntdb/test/run-11-simple-fetch.c            |   58 +
 lib/ntdb/test/run-12-check.c                   |   46 +
 lib/ntdb/test/run-15-append.c                  |  130 +
 lib/ntdb/test/run-20-growhash.c                |  137 +
 lib/ntdb/test/run-25-hashoverload.c            |  113 +
 lib/ntdb/test/run-30-exhaust-before-expand.c   |   71 +
 lib/ntdb/test/run-35-convert.c                 |   54 +
 lib/ntdb/test/run-50-multiple-freelists.c      |   70 +
 lib/ntdb/test/run-56-open-during-transaction.c |  165 +
 lib/ntdb/test/run-57-die-during-transaction.c  |  294 ++
 lib/ntdb/test/run-64-bit-tdb.c                 |   72 +
 lib/ntdb/test/run-90-get-set-attributes.c      |  159 +
 lib/ntdb/test/run-capabilities.c               |  271 ++
 lib/ntdb/test/run-expand-in-transaction.c      |   36 +
 lib/ntdb/test/run-features.c                   |   62 +
 lib/ntdb/test/run-lockall.c                    |   70 +
 lib/ntdb/test/run-remap-in-read_traverse.c     |   57 +
 lib/ntdb/test/run-seed.c                       |   61 +
 lib/ntdb/test/run-tdb_errorstr.c               |   52 +
 lib/ntdb/test/run-tdb_foreach.c                |   86 +
 lib/ntdb/test/run-traverse.c                   |  203 +
 lib/ntdb/test/tap-interface.c                  |    3 +
 lib/ntdb/test/tap-interface.h                  |   41 +
 lib/ntdb/tools/Makefile                        |   16 +
 lib/ntdb/tools/growtdb-bench.c                 |  114 +
 lib/ntdb/tools/mkntdb.c                        |   29 +
 lib/ntdb/tools/ntdbbackup.c                    |  340 ++
 lib/ntdb/tools/ntdbdump.c                      |  122 +
 lib/ntdb/tools/ntdbrestore.c                   |  231 ++
 lib/ntdb/tools/ntdbtool.c                      |  810 ++++
 lib/ntdb/tools/ntdbtorture.c                   |  529 +++
 lib/ntdb/tools/speed.c                         |  443 +++
 lib/ntdb/transaction.c                         | 1322 +++++++
 lib/ntdb/traverse.c                            |   99 +
 lib/ntdb/wscript                               |  265 ++
 lib/tdb2/ABI/tdb-2.0.0.sigs                    |   40 -
 lib/tdb2/ABI/tdb-2.0.1.sigs                    |   39 -
 lib/tdb2/LICENSE                               |  165 -
 lib/tdb2/Makefile                              |   67 -
 lib/tdb2/TODO                                  |    4 -
 lib/tdb2/_info                                 |   91 -
 lib/tdb2/check.c                               |  864 -----
 lib/tdb2/configure                             |   21 -
 lib/tdb2/doc/TDB1_porting.txt                  |   72 -
 lib/tdb2/doc/design-1.3.txt                    | 1049 ------
 lib/tdb2/doc/design.lyx                        | 2689 --------------
 lib/tdb2/doc/design.lyx,v                      | 4679 ------------------------
 lib/tdb2/doc/design.pdf                        |  Bin 240440 -> 0 bytes
 lib/tdb2/doc/design.txt                        | 1258 -------
 lib/tdb2/free.c                                |  976 -----
 lib/tdb2/hash.c                                |  894 -----
 lib/tdb2/io.c                                  |  650 ----
 lib/tdb2/lock.c                                |  883 -----
 lib/tdb2/open.c                                |  768 ----
 lib/tdb2/private.h                             |  657 ----
 lib/tdb2/pytdb.c                               |  591 ---
 lib/tdb2/summary.c                             |  330 --
 lib/tdb2/tdb.c                                 |  605 ---
 lib/tdb2/tdb.pc.in                             |   11 -
 lib/tdb2/tdb2.h                                |  897 -----
 lib/tdb2/test/api-12-store.c                   |   57 -
 lib/tdb2/test/api-13-delete.c                  |  205 --
 lib/tdb2/test/api-14-exists.c                  |   54 -
 lib/tdb2/test/api-16-wipe_all.c                |   46 -
 lib/tdb2/test/api-21-parse_record.c            |   67 -
 lib/tdb2/test/api-55-transaction.c             |   73 -
 lib/tdb2/test/api-80-tdb_fd.c                  |   32 -
 lib/tdb2/test/api-81-seqnum.c                  |   69 -
 lib/tdb2/test/api-82-lockattr.c                |  237 --
 lib/tdb2/test/api-83-openhook.c                |   96 -
 lib/tdb2/test/api-91-get-stats.c               |   57 -
 lib/tdb2/test/api-92-get-set-readonly.c        |  105 -
 lib/tdb2/test/api-93-repack.c                  |   80 -
 lib/tdb2/test/api-add-remove-flags.c           |   89 -
 lib/tdb2/test/api-check-callback.c             |   86 -
 lib/tdb2/test/api-firstkey-nextkey.c           |  159 -
 lib/tdb2/test/api-fork-test.c                  |  179 -
 lib/tdb2/test/api-locktimeout.c                |  193 -
 lib/tdb2/test/api-missing-entries.c            |   44 -
 lib/tdb2/test/api-open-multiple-times.c        |   83 -
 lib/tdb2/test/api-record-expand.c              |   51 -
 lib/tdb2/test/api-simple-delete.c              |   39 -
 lib/tdb2/test/api-summary.c                    |   58 -
 lib/tdb2/test/external-agent.c                 |  252 --
 lib/tdb2/test/external-agent.h                 |   51 -
 lib/tdb2/test/failtest_helper.c                |   96 -
 lib/tdb2/test/failtest_helper.h                |   19 -
 lib/tdb2/test/helpapi-external-agent.c         |    7 -
 lib/tdb2/test/helprun-external-agent.c         |    7 -
 lib/tdb2/test/helprun-layout.c                 |  402 --
 lib/tdb2/test/layout.h                         |   87 -
 lib/tdb2/test/lock-tracking.c                  |  147 -
 lib/tdb2/test/lock-tracking.h                  |   25 -
 lib/tdb2/test/logging.c                        |   30 -
 lib/tdb2/test/logging.h                        |   17 -
 lib/tdb2/test/run-001-encode.c                 |   41 -
 lib/tdb2/test/run-001-fls.c                    |   33 -
 lib/tdb2/test/run-01-new_database.c            |   34 -
 lib/tdb2/test/run-02-expand.c                  |   62 -
 lib/tdb2/test/run-03-coalesce.c                |  178 -
 lib/tdb2/test/run-04-basichash.c               |  260 --
 lib/tdb2/test/run-05-readonly-open.c           |   71 -
 lib/tdb2/test/run-10-simple-store.c            |   58 -
 lib/tdb2/test/run-11-simple-fetch.c            |   58 -
 lib/tdb2/test/run-12-check.c                   |   46 -
 lib/tdb2/test/run-15-append.c                  |  130 -
 lib/tdb2/test/run-20-growhash.c                |  137 -
 lib/tdb2/test/run-25-hashoverload.c            |  113 -
 lib/tdb2/test/run-30-exhaust-before-expand.c   |   71 -
 lib/tdb2/test/run-35-convert.c                 |   54 -
 lib/tdb2/test/run-50-multiple-freelists.c      |   70 -
 lib/tdb2/test/run-56-open-during-transaction.c |  165 -
 lib/tdb2/test/run-57-die-during-transaction.c  |  293 --
 lib/tdb2/test/run-64-bit-tdb.c                 |   72 -
 lib/tdb2/test/run-90-get-set-attributes.c      |  159 -
 lib/tdb2/test/run-capabilities.c               |  271 --
 lib/tdb2/test/run-expand-in-transaction.c      |   36 -
 lib/tdb2/test/run-features.c                   |   62 -
 lib/tdb2/test/run-lockall.c                    |   71 -
 lib/tdb2/test/run-remap-in-read_traverse.c     |   57 -
 lib/tdb2/test/run-seed.c                       |   61 -
 lib/tdb2/test/run-tdb_errorstr.c               |   52 -
 lib/tdb2/test/run-tdb_foreach.c                |   86 -
 lib/tdb2/test/run-traverse.c                   |  203 -
 lib/tdb2/test/tap-interface.c                  |    3 -
 lib/tdb2/test/tap-interface.h                  |   41 -
 lib/tdb2/test/tdb2-source.h                    |   11 -
 lib/tdb2/tools/Makefile                        |   16 -
 lib/tdb2/tools/growtdb-bench.c                 |  114 -
 lib/tdb2/tools/mktdb2.c                        |   29 -
 lib/tdb2/tools/speed.c                         |  443 ---
 lib/tdb2/tools/tdb2backup.c                    |  340 --
 lib/tdb2/tools/tdb2dump.c                      |  122 -
 lib/tdb2/tools/tdb2restore.c                   |  231 --
 lib/tdb2/tools/tdb2tool.c                      |  810 ----
 lib/tdb2/tools/tdb2torture.c                   |  529 ---
 lib/tdb2/transaction.c                         | 1322 -------
 lib/tdb2/traverse.c                            |   99 -
 lib/tdb2/wscript                               |  278 --
 script/autobuild.py                            |   11 +-
 206 files changed, 28850 insertions(+), 28992 deletions(-)
 create mode 100644 lib/ntdb/ABI/ntdb-0.9.sigs
 create mode 100644 lib/ntdb/LICENSE
 create mode 100644 lib/ntdb/Makefile
 create mode 100644 lib/ntdb/check.c
 create mode 100755 lib/ntdb/configure
 create mode 100644 lib/ntdb/doc/TDB_porting.txt
 create mode 100644 lib/ntdb/doc/design-1.3.txt
 create mode 100644 lib/ntdb/doc/design.lyx
 create mode 100644 lib/ntdb/doc/design.lyx,v
 create mode 100644 lib/ntdb/doc/design.pdf
 create mode 100644 lib/ntdb/doc/design.txt
 create mode 100644 lib/ntdb/free.c
 create mode 100644 lib/ntdb/hash.c
 create mode 100644 lib/ntdb/io.c
 create mode 100644 lib/ntdb/lock.c
 create mode 100644 lib/ntdb/ntdb.c
 create mode 100644 lib/ntdb/ntdb.h
 create mode 100644 lib/ntdb/ntdb.pc.in
 create mode 100644 lib/ntdb/open.c
 create mode 100644 lib/ntdb/private.h
 create mode 100644 lib/ntdb/pyntdb.c
 create mode 100644 lib/ntdb/summary.c
 create mode 100644 lib/ntdb/test/api-12-store.c
 create mode 100644 lib/ntdb/test/api-13-delete.c
 create mode 100644 lib/ntdb/test/api-14-exists.c
 create mode 100644 lib/ntdb/test/api-16-wipe_all.c
 create mode 100644 lib/ntdb/test/api-21-parse_record.c
 create mode 100644 lib/ntdb/test/api-55-transaction.c
 create mode 100644 lib/ntdb/test/api-80-tdb_fd.c
 create mode 100644 lib/ntdb/test/api-81-seqnum.c
 create mode 100644 lib/ntdb/test/api-82-lockattr.c
 create mode 100644 lib/ntdb/test/api-83-openhook.c
 create mode 100644 lib/ntdb/test/api-91-get-stats.c
 create mode 100644 lib/ntdb/test/api-92-get-set-readonly.c
 create mode 100644 lib/ntdb/test/api-93-repack.c
 create mode 100644 lib/ntdb/test/api-add-remove-flags.c
 create mode 100644 lib/ntdb/test/api-check-callback.c
 create mode 100644 lib/ntdb/test/api-firstkey-nextkey.c
 create mode 100644 lib/ntdb/test/api-fork-test.c
 create mode 100644 lib/ntdb/test/api-locktimeout.c
 create mode 100644 lib/ntdb/test/api-missing-entries.c
 create mode 100644 lib/ntdb/test/api-open-multiple-times.c
 create mode 100644 lib/ntdb/test/api-record-expand.c
 create mode 100644 lib/ntdb/test/api-simple-delete.c
 create mode 100644 lib/ntdb/test/api-summary.c
 create mode 100644 lib/ntdb/test/external-agent.c
 create mode 100644 lib/ntdb/test/external-agent.h
 create mode 100644 lib/ntdb/test/failtest_helper.c
 create mode 100644 lib/ntdb/test/failtest_helper.h
 create mode 100644 lib/ntdb/test/helpapi-external-agent.c
 create mode 100644 lib/ntdb/test/helprun-external-agent.c
 create mode 100644 lib/ntdb/test/helprun-layout.c
 create mode 100644 lib/ntdb/test/layout.h
 create mode 100644 lib/ntdb/test/lock-tracking.c
 create mode 100644 lib/ntdb/test/lock-tracking.h
 create mode 100644 lib/ntdb/test/logging.c
 create mode 100644 lib/ntdb/test/logging.h
 create mode 100644 lib/ntdb/test/ntdb-source.h
 create mode 100644 lib/ntdb/test/run-001-encode.c
 create mode 100644 lib/ntdb/test/run-001-fls.c
 create mode 100644 lib/ntdb/test/run-01-new_database.c
 create mode 100644 lib/ntdb/test/run-02-expand.c
 create mode 100644 lib/ntdb/test/run-03-coalesce.c
 create mode 100644 lib/ntdb/test/run-04-basichash.c
 create mode 100644 lib/ntdb/test/run-05-readonly-open.c
 create mode 100644 lib/ntdb/test/run-10-simple-store.c
 create mode 100644 lib/ntdb/test/run-11-simple-fetch.c
 create mode 100644 lib/ntdb/test/run-12-check.c
 create mode 100644 lib/ntdb/test/run-15-append.c
 create mode 100644 lib/ntdb/test/run-20-growhash.c
 create mode 100644 lib/ntdb/test/run-25-hashoverload.c
 create mode 100644 lib/ntdb/test/run-30-exhaust-before-expand.c
 create mode 100644 lib/ntdb/test/run-35-convert.c
 create mode 100644 lib/ntdb/test/run-50-multiple-freelists.c
 create mode 100644 lib/ntdb/test/run-56-open-during-transaction.c
 create mode 100644 lib/ntdb/test/run-57-die-during-transaction.c
 create mode 100644 lib/ntdb/test/run-64-bit-tdb.c
 create mode 100644 lib/ntdb/test/run-90-get-set-attributes.c
 create mode 100644 lib/ntdb/test/run-capabilities.c
 create mode 100644 lib/ntdb/test/run-expand-in-transaction.c
 create mode 100644 lib/ntdb/test/run-features.c
 create mode 100644 lib/ntdb/test/run-lockall.c
 create mode 100644 lib/ntdb/test/run-remap-in-read_traverse.c
 create mode 100644 lib/ntdb/test/run-seed.c
 create mode 100644 lib/ntdb/test/run-tdb_errorstr.c
 create mode 100644 lib/ntdb/test/run-tdb_foreach.c
 create mode 100644 lib/ntdb/test/run-traverse.c
 create mode 100644 lib/ntdb/test/tap-interface.c
 create mode 100644 lib/ntdb/test/tap-interface.h
 create mode 100644 lib/ntdb/tools/Makefile
 create mode 100644 lib/ntdb/tools/growtdb-bench.c
 create mode 100644 lib/ntdb/tools/mkntdb.c
 create mode 100644 lib/ntdb/tools/ntdbbackup.c
 create mode 100644 lib/ntdb/tools/ntdbdump.c
 create mode 100644 lib/ntdb/tools/ntdbrestore.c
 create mode 100644 lib/ntdb/tools/ntdbtool.c
 create mode 100644 lib/ntdb/tools/ntdbtorture.c
 create mode 100644 lib/ntdb/tools/speed.c
 create mode 100644 lib/ntdb/transaction.c
 create mode 100644 lib/ntdb/traverse.c
 create mode 100644 lib/ntdb/wscript
 delete mode 100644 lib/tdb2/ABI/tdb-2.0.0.sigs
 delete mode 100644 lib/tdb2/ABI/tdb-2.0.1.sigs
 delete mode 100644 lib/tdb2/LICENSE
 delete mode 100644 lib/tdb2/Makefile
 delete mode 100644 lib/tdb2/TODO
 delete mode 100644 lib/tdb2/_info
 delete mode 100644 lib/tdb2/check.c
 delete mode 100755 lib/tdb2/configure
 delete mode 100644 lib/tdb2/doc/TDB1_porting.txt
 delete mode 100644 lib/tdb2/doc/design-1.3.txt
 delete mode 100644 lib/tdb2/doc/design.lyx
 delete mode 100644 lib/tdb2/doc/design.lyx,v
 delete mode 100644 lib/tdb2/doc/design.pdf
 delete mode 100644 lib/tdb2/doc/design.txt
 delete mode 100644 lib/tdb2/free.c
 delete mode 100644 lib/tdb2/hash.c
 delete mode 100644 lib/tdb2/io.c
 delete mode 100644 lib/tdb2/lock.c
 delete mode 100644 lib/tdb2/open.c
 delete mode 100644 lib/tdb2/private.h
 delete mode 100644 lib/tdb2/pytdb.c
 delete mode 100644 lib/tdb2/summary.c
 delete mode 100644 lib/tdb2/tdb.c
 delete mode 100644 lib/tdb2/tdb.pc.in
 delete mode 100644 lib/tdb2/tdb2.h
 delete mode 100644 lib/tdb2/test/api-12-store.c
 delete mode 100644 lib/tdb2/test/api-13-delete.c
 delete mode 100644 lib/tdb2/test/api-14-exists.c
 delete mode 100644 lib/tdb2/test/api-16-wipe_all.c
 delete mode 100644 lib/tdb2/test/api-21-parse_record.c
 delete mode 100644 lib/tdb2/test/api-55-transaction.c
 delete mode 100644 lib/tdb2/test/api-80-tdb_fd.c
 delete mode 100644 lib/tdb2/test/api-81-seqnum.c
 delete mode 100644 lib/tdb2/test/api-82-lockattr.c
 delete mode 100644 lib/tdb2/test/api-83-openhook.c
 delete mode 100644 lib/tdb2/test/api-91-get-stats.c
 delete mode 100644 lib/tdb2/test/api-92-get-set-readonly.c
 delete mode 100644 lib/tdb2/test/api-93-repack.c
 delete mode 100644 lib/tdb2/test/api-add-remove-flags.c
 delete mode 100644 lib/tdb2/test/api-check-callback.c
 delete mode 100644 lib/tdb2/test/api-firstkey-nextkey.c
 delete mode 100644 lib/tdb2/test/api-fork-test.c
 delete mode 100644 lib/tdb2/test/api-locktimeout.c
 delete mode 100644 lib/tdb2/test/api-missing-entries.c
 delete mode 100644 lib/tdb2/test/api-open-multiple-times.c
 delete mode 100644 lib/tdb2/test/api-record-expand.c
 delete mode 100644 lib/tdb2/test/api-simple-delete.c
 delete mode 100644 lib/tdb2/test/api-summary.c
 delete mode 100644 lib/tdb2/test/external-agent.c
 delete mode 100644 lib/tdb2/test/external-agent.h
 delete mode 100644 lib/tdb2/test/failtest_helper.c
 delete mode 100644 lib/tdb2/test/failtest_helper.h
 delete mode 100644 lib/tdb2/test/helpapi-external-agent.c
 delete mode 100644 lib/tdb2/test/helprun-external-agent.c
 delete mode 100644 lib/tdb2/test/helprun-layout.c
 delete mode 100644 lib/tdb2/test/layout.h
 delete mode 100644 lib/tdb2/test/lock-tracking.c
 delete mode 100644 lib/tdb2/test/lock-tracking.h
 delete mode 100644 lib/tdb2/test/logging.c
 delete mode 100644 lib/tdb2/test/logging.h
 delete mode 100644 lib/tdb2/test/run-001-encode.c
 delete mode 100644 lib/tdb2/test/run-001-fls.c
 delete mode 100644 lib/tdb2/test/run-01-new_database.c
 delete mode 100644 lib/tdb2/test/run-02-expand.c
 delete mode 100644 lib/tdb2/test/run-03-coalesce.c
 delete mode 100644 lib/tdb2/test/run-04-basichash.c
 delete mode 100644 lib/tdb2/test/run-05-readonly-open.c
 delete mode 100644 lib/tdb2/test/run-10-simple-store.c
 delete mode 100644 lib/tdb2/test/run-11-simple-fetch.c
 delete mode 100644 lib/tdb2/test/run-12-check.c
 delete mode 100644 lib/tdb2/test/run-15-append.c
 delete mode 100644 lib/tdb2/test/run-20-growhash.c
 delete mode 100644 lib/tdb2/test/run-25-hashoverload.c
 delete mode 100644 lib/tdb2/test/run-30-exhaust-before-expand.c
 delete mode 100644 lib/tdb2/test/run-35-convert.c
 delete mode 100644 lib/tdb2/test/run-50-multiple-freelists.c
 delete mode 100644 lib/tdb2/test/run-56-open-during-transaction.c
 delete mode 100644 lib/tdb2/test/run-57-die-during-transaction.c
 delete mode 100644 lib/tdb2/test/run-64-bit-tdb.c
 delete mode 100644 lib/tdb2/test/run-90-get-set-attributes.c
 delete mode 100644 lib/tdb2/test/run-capabilities.c
 delete mode 100644 lib/tdb2/test/run-expand-in-transaction.c
 delete mode 100644 lib/tdb2/test/run-features.c
 delete mode 100644 lib/tdb2/test/run-lockall.c
 delete mode 100644 lib/tdb2/test/run-remap-in-read_traverse.c
 delete mode 100644 lib/tdb2/test/run-seed.c
 delete mode 100644 lib/tdb2/test/run-tdb_errorstr.c
 delete mode 100644 lib/tdb2/test/run-tdb_foreach.c
 delete mode 100644 lib/tdb2/test/run-traverse.c
 delete mode 100644 lib/tdb2/test/tap-interface.c
 delete mode 100644 lib/tdb2/test/tap-interface.h
 delete mode 100644 lib/tdb2/test/tdb2-source.h
 delete mode 100644 lib/tdb2/tools/Makefile
 delete mode 100644 lib/tdb2/tools/growtdb-bench.c
 delete mode 100644 lib/tdb2/tools/mktdb2.c
 delete mode 100644 lib/tdb2/tools/speed.c
 delete mode 100644 lib/tdb2/tools/tdb2backup.c
 delete mode 100644 lib/tdb2/tools/tdb2dump.c
 delete mode 100644 lib/tdb2/tools/tdb2restore.c
 delete mode 100644 lib/tdb2/tools/tdb2tool.c
 delete mode 100644 lib/tdb2/tools/tdb2torture.c
 delete mode 100644 lib/tdb2/transaction.c
 delete mode 100644 lib/tdb2/traverse.c
 delete mode 100644 lib/tdb2/wscript

diff --git a/lib/ntdb/ABI/ntdb-0.9.sigs b/lib/ntdb/ABI/ntdb-0.9.sigs
new file mode 100644
index 0000000000..6dae18fb6c
--- /dev/null
+++ b/lib/ntdb/ABI/ntdb-0.9.sigs
@@ -0,0 +1,39 @@
+ntdb_add_flag: void (struct ntdb_context *, unsigned int)
+ntdb_append: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, NTDB_DATA)
+ntdb_chainlock: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA)
+ntdb_chainlock_read: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA)
+ntdb_chainunlock: void (struct ntdb_context *, NTDB_DATA)
+ntdb_chainunlock_read: void (struct ntdb_context *, NTDB_DATA)
+ntdb_check_: enum NTDB_ERROR (struct ntdb_context *, enum NTDB_ERROR (*)(NTDB_DATA, NTDB_DATA, void *), void *)
+ntdb_close: int (struct ntdb_context *)
+ntdb_delete: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA)
+ntdb_error: enum NTDB_ERROR (struct ntdb_context *)
+ntdb_errorstr: const char *(enum NTDB_ERROR)
+ntdb_exists: bool (struct ntdb_context *, NTDB_DATA)
+ntdb_fd: int (const struct ntdb_context *)
+ntdb_fetch: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, NTDB_DATA *)
+ntdb_firstkey: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA *)
+ntdb_foreach_: void (int (*)(struct ntdb_context *, void *), void *)
+ntdb_get_attribute: enum NTDB_ERROR (struct ntdb_context *, union ntdb_attribute *)
+ntdb_get_flags: unsigned int (struct ntdb_context *)
+ntdb_get_seqnum: int64_t (struct ntdb_context *)
+ntdb_lockall: enum NTDB_ERROR (struct ntdb_context *)
+ntdb_lockall_read: enum NTDB_ERROR (struct ntdb_context *)
+ntdb_name: const char *(const struct ntdb_context *)
+ntdb_nextkey: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA *)
+ntdb_open: struct ntdb_context *(const char *, int, int, mode_t, union ntdb_attribute *)
+ntdb_parse_record_: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, enum NTDB_ERROR (*)(NTDB_DATA, NTDB_DATA, void *), void *)
+ntdb_remove_flag: void (struct ntdb_context *, unsigned int)
+ntdb_repack: enum NTDB_ERROR (struct ntdb_context *)
+ntdb_set_attribute: enum NTDB_ERROR (struct ntdb_context *, const union ntdb_attribute *)
+ntdb_store: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, NTDB_DATA, int)
+ntdb_summary: enum NTDB_ERROR (struct ntdb_context *, enum ntdb_summary_flags, char **)
+ntdb_transaction_cancel: void (struct ntdb_context *)
+ntdb_transaction_commit: enum NTDB_ERROR (struct ntdb_context *)
+ntdb_transaction_prepare_commit: enum NTDB_ERROR (struct ntdb_context *)
+ntdb_transaction_start: enum NTDB_ERROR (struct ntdb_context *)
+ntdb_traverse_: int64_t (struct ntdb_context *, int (*)(struct ntdb_context *, NTDB_DATA, NTDB_DATA, void *), void *)
+ntdb_unlockall: void (struct ntdb_context *)
+ntdb_unlockall_read: void (struct ntdb_context *)
+ntdb_unset_attribute: void (struct ntdb_context *, enum ntdb_attribute_type)
+ntdb_wipe_all: enum NTDB_ERROR (struct ntdb_context *)
diff --git a/lib/ntdb/LICENSE b/lib/ntdb/LICENSE
new file mode 100644
index 0000000000..cca7fc278f
--- /dev/null
+++ b/lib/ntdb/LICENSE
@@ -0,0 +1,165 @@
+		   GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+
+  This version of the GNU Lesser General Public License incorporates
+the terms and conditions of version 3 of the GNU General Public
+License, supplemented by the additional permissions listed below.
+
+  0. Additional Definitions.
+
+  As used herein, "this License" refers to version 3 of the GNU Lesser
+General Public License, and the "GNU GPL" refers to version 3 of the GNU
+General Public License.
+
+  "The Library" refers to a covered work governed by this License,
+other than an Application or a Combined Work as defined below.
+
+  An "Application" is any work that makes use of an interface provided
+by the Library, but which is not otherwise based on the Library.
+Defining a subclass of a class defined by the Library is deemed a mode
+of using an interface provided by the Library.
+
+  A "Combined Work" is a work produced by combining or linking an
+Application with the Library.  The particular version of the Library
+with which the Combined Work was made is also called the "Linked
+Version".
+
+  The "Minimal Corresponding Source" for a Combined Work means the
+Corresponding Source for the Combined Work, excluding any source code
+for portions of the Combined Work that, considered in isolation, are
+based on the Application, and not on the Linked Version.
+
+  The "Corresponding Application Code" for a Combined Work means the
+object code and/or source code for the Application, including any data
+and utility programs needed for reproducing the Combined Work from the
+Application, but excluding the System Libraries of the Combined Work.
+
+  1. Exception to Section 3 of the GNU GPL.
+
+  You may convey a covered work under sections 3 and 4 of this License
+without being bound by section 3 of the GNU GPL.
+
+  2. Conveying Modified Versions.
+
+  If you modify a copy of the Library, and, in your modifications, a
+facility refers to a function or data to be supplied by an Application
+that uses the facility (other than as an argument passed when the
+facility is invoked), then you may convey a copy of the modified
+version:
+
+   a) under this License, provided that you make a good faith effort to
+   ensure that, in the event an Application does not supply the
+   function or data, the facility still operates, and performs
+   whatever part of its purpose remains meaningful, or
+
+   b) under the GNU GPL, with none of the additional permissions of
+   this License applicable to that copy.
+
+  3. Object Code Incorporating Material from Library Header Files.
+
+  The object code form of an Application may incorporate material from
+a header file that is part of the Library.  You may convey such object
+code under terms of your choice, provided that, if the incorporated
+material is not limited to numerical parameters, data structure
+layouts and accessors, or small macros, inline functions and templates
+(ten or fewer lines in length), you do both of the following:
+
+   a) Give prominent notice with each copy of the object code that the
+   Library is used in it and that the Library and its use are
+   covered by this License.
+
+   b) Accompany the object code with a copy of the GNU GPL and this license
+   document.
+
+  4. Combined Works.
+
+  You may convey a Combined Work under terms of your choice that,
+taken together, effectively do not restrict modification of the
+portions of the Library contained in the Combined Work and reverse
+engineering for debugging such modifications, if you also do each of
+the following:
+
+   a) Give prominent notice with each copy of the Combined Work that
+   the Library is used in it and that the Library and its use are
+   covered by this License.
+
+   b) Accompany the Combined Work with a copy of the GNU GPL and this license
+   document.
+
+   c) For a Combined Work that displays copyright notices during
+   execution, include the copyright notice for the Library among
+   these notices, as well as a reference directing the user to the
+   copies of the GNU GPL and this license document.
+
+   d) Do one of the following:
+
+       0) Convey the Minimal Corresponding Source under the terms of this
+       License, and the Corresponding Application Code in a form
+       suitable for, and under terms that permit, the user to
+       recombine or relink the Application with a modified version of
+       the Linked Version to produce a modified Combined Work, in the
+       manner specified by section 6 of the GNU GPL for conveying
+       Corresponding Source.
+
+       1) Use a suitable shared library mechanism for linking with the
+       Library.  A suitable mechanism is one that (a) uses at run time
+       a copy of the Library already present on the user's computer
+       system, and (b) will operate properly with a modified version
+       of the Library that is interface-compatible with the Linked
+       Version.
+
+   e) Provide Installation Information, but only if you would otherwise
+   be required to provide such information under section 6 of the
+   GNU GPL, and only to the extent that such information is
+   necessary to install and execute a modified version of the
+   Combined Work produced by recombining or relinking the
+   Application with a modified version of the Linked Version. (If
+   you use option 4d0, the Installation Information must accompany
+   the Minimal Corresponding Source and Corresponding Application
+   Code. If you use option 4d1, you must provide the Installation
+   Information in the manner specified by section 6 of the GNU GPL
+   for conveying Corresponding Source.)
+
+  5. Combined Libraries.
+
+  You may place library facilities that are a work based on the
+Library side by side in a single library together with other library
+facilities that are not Applications and are not covered by this
+License, and convey such a combined library under terms of your
+choice, if you do both of the following:
+
+   a) Accompany the combined library with a copy of the same work based
+   on the Library, uncombined with any other library facilities,
+   conveyed under the terms of this License.
+
+   b) Give prominent notice with the combined library that part of it
+   is a work based on the Library, and explaining where to find the
+   accompanying uncombined form of the same work.
+
+  6. Revised Versions of the GNU Lesser General Public License.
+
+  The Free Software Foundation may publish revised and/or new versions
+of the GNU Lesser General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+  Each version is given a distinguishing version number. If the
+Library as you received it specifies that a certain numbered version
+of the GNU Lesser General Public License "or any later version"
+applies to it, you have the option of following the terms and
+conditions either of that published version or of any later version
+published by the Free Software Foundation. If the Library as you
+received it does not specify a version number of the GNU Lesser
+General Public License, you may choose any version of the GNU Lesser
+General Public License ever published by the Free Software Foundation.
+
+  If the Library as you received it specifies that a proxy can decide
+whether future versions of the GNU Lesser General Public License shall
+apply, that proxy's public statement of acceptance of any version is
+permanent authorization for you to choose that version for the
+Library.
diff --git a/lib/ntdb/Makefile b/lib/ntdb/Makefile
new file mode 100644
index 0000000000..ddd439d503
--- /dev/null
+++ b/lib/ntdb/Makefile
@@ -0,0 +1,67 @@
+# Thin makefile wrapper: each target below simply runs the matching waf command.
+# waf is looked up in buildtools/bin here, then two levels up, then on $PATH.
+WAF=WAF_MAKE=1 PATH=buildtools/bin:../../buildtools/bin:$$PATH waf
+
+all:
+	$(WAF) build
+
+install:
+	$(WAF) install
+
+uninstall:
+	$(WAF) uninstall
+# FORCE makes this run even though a directory named test/ exists here.
+test: FORCE
+	$(WAF) test $(TEST_OPTIONS)
+
+testenv:
+	$(WAF) test --testenv $(TEST_OPTIONS)
+
+quicktest:
+	$(WAF) test --quick $(TEST_OPTIONS)
+
+dist:
+	touch .tmplock
+	WAFLOCK=.tmplock $(WAF) dist
+
+distcheck:
+	touch .tmplock
+	WAFLOCK=.tmplock $(WAF) distcheck
+
+clean:
+	$(WAF) clean
+
+distclean:
+	$(WAF) distclean
+
+reconfigure: configure
+	$(WAF) reconfigure
+
+show_waf_options:
+	$(WAF) --help
+
+# some compatibility make targets: aliases with no recipes of their own
+everything: all
+
+testsuite: all
+
+.PHONY: check
+check: test
+
+torture: all
+
+# this should do an install as well, once install is finished
+installcheck: test
+
+etags:
+	$(WAF) etags
+
+ctags:
+	$(WAF) ctags
+
+pydoctor:
+	$(WAF) pydoctor
+# bin/<name> builds only that waf target; FORCE (no recipe) always re-runs it.
+bin/%:: FORCE
+	$(WAF) --targets=`basename $@`
+FORCE:
diff --git a/lib/ntdb/check.c b/lib/ntdb/check.c
new file mode 100644
index 0000000000..1c676c7d45
--- /dev/null
+++ b/lib/ntdb/check.c
@@ -0,0 +1,864 @@
+ /*
+   Trivial Database 2: consistency checking (ntdb_check)
+   Copyright (C) Rusty Russell 2010
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <ccan/likely/likely.h>
+#include <ccan/asearch/asearch.h>
+
+/* We keep an ordered array of offsets: push off on the end, false on OOM. */
+static bool append(ntdb_off_t **arr, size_t *num, ntdb_off_t off)
+{
+	ntdb_off_t *grown = realloc(*arr, sizeof(ntdb_off_t) * (*num + 1));
+	if (grown != NULL) {
+		grown[(*num)++] = off;
+		*arr = grown;
+	}
+	return grown != NULL;
+}
+
+static enum NTDB_ERROR check_header(struct ntdb_context *ntdb, ntdb_off_t *recovery,
+				   uint64_t *features, size_t *num_capabilities)
+{
+	uint64_t hash_test;
+	struct ntdb_header hdr;
+	enum NTDB_ERROR ecode;
+	ntdb_off_t off, next;
+	/* Validate the file header; report recovery offset and features. */
+	ecode = ntdb_read_convert(ntdb, 0, &hdr, sizeof(hdr));
+	if (ecode != NTDB_SUCCESS) {
+		return ecode;
+	}
+	/* magic food should not be converted, so convert back. */
+	ntdb_convert(ntdb, hdr.magic_food, sizeof(hdr.magic_food));
+	/* The stored hash of NTDB_HASH_MAGIC must match our hash function. */
+	hash_test = NTDB_HASH_MAGIC;
+	hash_test = ntdb_hash(ntdb, &hash_test, sizeof(hash_test));
+	if (hdr.hash_test != hash_test) {
+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+				  "check: hash test %llu should be %llu",
+				  (long long)hdr.hash_test,
+				  (long long)hash_test);
+	}
+
+	if (strcmp(hdr.magic_food, NTDB_MAGIC_FOOD) != 0) {
+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+				  "check: bad magic '%.*s'",
+				  (unsigned)sizeof(hdr.magic_food),
+				  hdr.magic_food);
+	}
+
+	/* Features which are used must be a subset of features offered. */
+	if (hdr.features_used & ~hdr.features_offered) {
+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+				  "check: features used (0x%llx) which"
+				  " are not offered (0x%llx)",
+				  (long long)hdr.features_used,
+				  (long long)hdr.features_offered);
+	}
+	/* Hand the offered features and recovery offset back to the caller. */
+	*features = hdr.features_offered;
+	*recovery = hdr.recovery;
+	if (*recovery) {
+		if (*recovery < sizeof(hdr)
+		    || *recovery > ntdb->file->map_size) {
+			return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+					  "ntdb_check:"
+					  " invalid recovery offset %zu",
+					  (size_t)*recovery);
+		}
+	}
+	/* Walk the capability list; each entry passed bumps *num_capabilities. */
+	for (off = hdr.capabilities; off && ecode == NTDB_SUCCESS; off = next) {
+		const struct ntdb_capability *cap;
+		enum NTDB_ERROR e;
+
+		cap = ntdb_access_read(ntdb, off, sizeof(*cap), true);
+		if (NTDB_PTR_IS_ERR(cap)) {
+			return NTDB_PTR_ERR(cap);
+		}
+
+		/* All capabilities are unknown. */
+		e = unknown_capability(ntdb, "ntdb_check", cap->type);
+		next = cap->next;
+		ntdb_access_release(ntdb, cap);
+		if (e)
+			return e;
+		(*num_capabilities)++;
+	}
+
+	/* Don't check reserved: they *can* be used later. */
+	return NTDB_SUCCESS;
+}
+
+static enum NTDB_ERROR check_hash_tree(struct ntdb_context *ntdb,
+				      ntdb_off_t off, unsigned int group_bits,
+				      uint64_t hprefix,
+				      unsigned hprefix_bits,
+				      ntdb_off_t used[],
+				      size_t num_used,
+				      size_t *num_found,
+				      enum NTDB_ERROR (*check)(NTDB_DATA,
+							      NTDB_DATA, void *),
+				      void *data);
+
+static enum NTDB_ERROR check_hash_chain(struct ntdb_context *ntdb,
+				       ntdb_off_t off,
+				       uint64_t hash,
+				       ntdb_off_t used[],
+				       size_t num_used,
+				       size_t *num_found,
+				       enum NTDB_ERROR (*check)(NTDB_DATA,
+							       NTDB_DATA,
+							       void *),
+				       void *data)
+{
+	struct ntdb_used_record rec;
+	enum NTDB_ERROR ecode;
+	/* Check one chain record at off, then recurse down its next pointer. */
+	ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
+	if (ecode != NTDB_SUCCESS) {
+		return ecode;
+	}
+	/* Header must say: chain magic, ntdb_chain payload, no key, no hash. */
+	if (rec_magic(&rec) != NTDB_CHAIN_MAGIC) {
+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+				  "ntdb_check: Bad hash chain magic %llu",
+				  (long long)rec_magic(&rec));
+	}
+
+	if (rec_data_length(&rec) != sizeof(struct ntdb_chain)) {
+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+				  "ntdb_check:"
+				  " Bad hash chain length %llu vs %zu",
+				  (long long)rec_data_length(&rec),
+				  sizeof(struct ntdb_chain));
+	}
+	if (rec_key_length(&rec) != 0) {
+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+				  "ntdb_check: Bad hash chain key length %llu",
+				  (long long)rec_key_length(&rec));
+	}
+	if (rec_hash(&rec) != 0) {
+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+				  "ntdb_check: Bad hash chain hash value %llu",
+				  (long long)rec_hash(&rec));
+	}
+	/* Entries follow the header; 64 prefix bits = compare the full hash. */
+	off += sizeof(rec);
+	ecode = check_hash_tree(ntdb, off, 0, hash, 64,
+				used, num_used, num_found, check, data);
+	if (ecode != NTDB_SUCCESS) {
+		return ecode;
+	}
+	/* Follow the chain: a zero next offset terminates it. */
+	off = ntdb_read_off(ntdb, off + offsetof(struct ntdb_chain, next));
+	if (NTDB_OFF_IS_ERR(off)) {
+		return NTDB_OFF_TO_ERR(off);
+	}
+	if (off == 0)
+		return NTDB_SUCCESS;
+	(*num_found)++;
+	return check_hash_chain(ntdb, off, hash, used, num_used, num_found,
+				check, data);
+}
+
+static enum NTDB_ERROR check_hash_record(struct ntdb_context *ntdb,
+					ntdb_off_t off,
+					uint64_t hprefix,
+					unsigned hprefix_bits,
+					ntdb_off_t used[],
+					size_t num_used,
+					size_t *num_found,
+					enum NTDB_ERROR (*check)(NTDB_DATA,
+								NTDB_DATA,
+								void *),
+					void *data)
+{
+	struct ntdb_used_record rec;
+	enum NTDB_ERROR ecode;
+	/* All 64 hash bits consumed: the record must be a chain, not a table. */
+	if (hprefix_bits >= 64)
+		return check_hash_chain(ntdb, off, hprefix, used, num_used,
+					num_found, check, data);
+	/* Otherwise validate the sub-level hash table header at off. */
+	ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
+	if (ecode != NTDB_SUCCESS) {
+		return ecode;
+	}
+
+	if (rec_magic(&rec) != NTDB_HTABLE_MAGIC) {
+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+				  "ntdb_check: Bad hash table magic %llu",
+				  (long long)rec_magic(&rec));
+	}
+	if (rec_data_length(&rec)
+	    != sizeof(ntdb_off_t) << NTDB_SUBLEVEL_HASH_BITS) {
+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+				  "ntdb_check:"
+				  " Bad hash table length %llu vs %llu",
+				  (long long)rec_data_length(&rec),
+				  (long long)sizeof(ntdb_off_t)
+				  << NTDB_SUBLEVEL_HASH_BITS);
+	}
+	if (rec_key_length(&rec) != 0) {
+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+				  "ntdb_check: Bad hash table key length %llu",
+				  (long long)rec_key_length(&rec));
+	}
+	if (rec_hash(&rec) != 0) {
+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+				  "ntdb_check: Bad hash table hash value %llu",
+				  (long long)rec_hash(&rec));
+	}
+	/* The table of offsets starts right after the record header. */
+	off += sizeof(rec);
+	return check_hash_tree(ntdb, off,
+			       NTDB_SUBLEVEL_HASH_BITS-NTDB_HASH_GROUP_BITS,
+			       hprefix, hprefix_bits,
+			       used, num_used, num_found, check, data);
+}
+
+static int off_cmp(const ntdb_off_t *a, const ntdb_off_t *b)
+{
+	/* Compare rather than subtract: the difference can overflow an int. */
+	if (*a < *b)
+		return -1;
+	return *a > *b;
+}
+
+static uint64_t get_bits(uint64_t h, unsigned num, unsigned *used)
+{
+	*used += num;	/* Bits are taken from the top of h downwards. */
+	/* Wide mask for num >= 32; num == 0 would otherwise shift by 64. */
+	return num ? (h >> (64 - *used)) & (((uint64_t)1 << num) - 1) : 0;
+}
+
+static enum NTDB_ERROR check_hash_tree(struct ntdb_context *ntdb,
+				      ntdb_off_t off, unsigned int group_bits,
+				      uint64_t hprefix,
+				      unsigned hprefix_bits,
+				      ntdb_off_t used[],
+				      size_t num_used,
+				      size_t *num_found,
+				      enum NTDB_ERROR (*check)(NTDB_DATA,
+							      NTDB_DATA, void *),
+				      void *data)
+{
+	unsigned int g, b;
+	const ntdb_off_t *hash;
+	struct ntdb_used_record rec;
+	enum NTDB_ERROR ecode;
+	/* Scan all buckets of the hash table at off, recursing into subhashes. */
+	hash = ntdb_access_read(ntdb, off,
+			       sizeof(ntdb_off_t)
+			       << (group_bits + NTDB_HASH_GROUP_BITS),
+			       true);
+	if (NTDB_PTR_IS_ERR(hash)) {
+		return NTDB_PTR_ERR(hash);
+	}
+
+	for (g = 0; g < (1 << group_bits); g++) {
+		const ntdb_off_t *group = hash + (g << NTDB_HASH_GROUP_BITS);
+		for (b = 0; b < (1 << NTDB_HASH_GROUP_BITS); b++) {
+			unsigned int bucket, i, used_bits;
+			uint64_t h;
+			ntdb_off_t *p;
+			if (group[b] == 0)
+				continue;
+			/* Mask to the bare offset: it must be a known used record. */
+			off = group[b] & NTDB_OFF_MASK;
+			p = asearch(&off, used, num_used, off_cmp);
+			if (!p) {
+				ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
+						   NTDB_LOG_ERROR,
+						   "ntdb_check: Invalid offset"
+						   " %llu in hash",
+						   (long long)off);
+				goto fail;
+			}
+			/* Mark it invalid. */
+			*p ^= 1;
+			(*num_found)++;
+
+			if (hprefix_bits == 64) {
+				/* Chained entries are unordered. */
+				if (is_subhash(group[b])) {
+					ecode = NTDB_ERR_CORRUPT;
+					ntdb_logerr(ntdb, ecode,
+						   NTDB_LOG_ERROR,
+						   "ntdb_check: Invalid chain"
+						   " entry subhash");
+					goto fail;
+				}
+				h = hash_record(ntdb, off);
+				if (h != hprefix) {
+					ecode = NTDB_ERR_CORRUPT;
+					ntdb_logerr(ntdb, ecode,
+						   NTDB_LOG_ERROR,
+						   "check: bad hash chain"
+						   " placement"
+						   " 0x%llx vs 0x%llx",
+						   (long long)h,
+						   (long long)hprefix);
+					goto fail;
+				}
+				ecode = ntdb_read_convert(ntdb, off, &rec,
+							 sizeof(rec));
+				if (ecode != NTDB_SUCCESS) {
+					goto fail;
+				}
+				goto check;
+			}
+
+			if (is_subhash(group[b])) {
+				uint64_t subprefix;
+				subprefix = (hprefix
+				     << (group_bits + NTDB_HASH_GROUP_BITS))
+					+ g * (1 << NTDB_HASH_GROUP_BITS) + b;
+
+				ecode = check_hash_record(ntdb,
+					       group[b] & NTDB_OFF_MASK,
+					       subprefix,
+					       hprefix_bits
+						       + group_bits
+						       + NTDB_HASH_GROUP_BITS,
+					       used, num_used, num_found,
+					       check, data);
+				if (ecode != NTDB_SUCCESS) {
+					goto fail;
+				}
+				continue;
+			}
+			/* A normal entry */
+
+			/* Does it belong here at all? */
+			h = hash_record(ntdb, off);
+			used_bits = 0;
+			if (get_bits(h, hprefix_bits, &used_bits) != hprefix
+			    && hprefix_bits) {
+				ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
+						   NTDB_LOG_ERROR,
+						   "check: bad hash placement"
+						   " 0x%llx vs 0x%llx",
+						   (long long)h,
+						   (long long)hprefix);
+				goto fail;
+			}
+
+			/* Does it belong in this group? */
+			if (get_bits(h, group_bits, &used_bits) != g) {
+				ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
+						   NTDB_LOG_ERROR,
+						   "check: bad group %llu"
+						   " vs %u",
+						   (long long)h, g);
+				goto fail;
+			}
+
+			/* Are bucket bits correct? */
+			bucket = group[b] & NTDB_OFF_HASH_GROUP_MASK;
+			if (get_bits(h, NTDB_HASH_GROUP_BITS, &used_bits)
+			    != bucket) {
+				used_bits -= NTDB_HASH_GROUP_BITS;
+				ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
+						   NTDB_LOG_ERROR,
+						   "check: bad bucket %u vs %u",
+						   (unsigned)get_bits(h,
+							NTDB_HASH_GROUP_BITS,
+							&used_bits),
+						   bucket);
+				goto fail;
+			}
+
+			/* There must not be any zero entries between
+			 * the bucket it belongs in and this one! */
+			for (i = bucket;
+			     i != b;
+			     i = (i + 1) % (1 << NTDB_HASH_GROUP_BITS)) {
+				if (group[i] == 0) {
+					ecode = NTDB_ERR_CORRUPT;
+					ntdb_logerr(ntdb, ecode,
+						   NTDB_LOG_ERROR,
+						   "check: bad group placement"
+						   " %u vs %u",
+						   b, bucket);
+					goto fail;
+				}
+			}
+
+			ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
+			if (ecode != NTDB_SUCCESS) {
+				goto fail;
+			}
+
+			/* Bottom bits must match header. */
+			if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
+				ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
+						   NTDB_LOG_ERROR,
+						   "ntdb_check: Bad hash magic"
+						   " at offset %llu"
+						   " (0x%llx vs 0x%llx)",
+						   (long long)off,
+						   (long long)h,
+						   (long long)rec_hash(&rec));
+				goto fail;
+			}
+			/* Both paths end here: run the caller's callback, if any. */
+		check:
+			if (check) {
+				NTDB_DATA k, d;
+				const unsigned char *kptr;
+
+				kptr = ntdb_access_read(ntdb,
+						       off + sizeof(rec),
+						       rec_key_length(&rec)
+						       + rec_data_length(&rec),
+						       false);
+				if (NTDB_PTR_IS_ERR(kptr)) {
+					ecode = NTDB_PTR_ERR(kptr);
+					goto fail;
+				}
+
+				k = ntdb_mkdata(kptr, rec_key_length(&rec));
+				d = ntdb_mkdata(kptr + k.dsize,
+					       rec_data_length(&rec));
+				ecode = check(k, d, data);
+				ntdb_access_release(ntdb, kptr);
+				if (ecode != NTDB_SUCCESS) {
+					goto fail;
+				}
+			}
+		}
+	}
+	ntdb_access_release(ntdb, hash);
+	return NTDB_SUCCESS;
+
+fail:
+	ntdb_access_release(ntdb, hash);
+	return ecode;
+}
+
+static enum NTDB_ERROR check_hash(struct ntdb_context *ntdb,
+				 ntdb_off_t used[],
+				 size_t num_used, size_t num_other_used,
+				 enum NTDB_ERROR (*check)(NTDB_DATA, NTDB_DATA, void *),
+				 void *data)
+{
+	enum NTDB_ERROR ecode;
+	/* Free tables and capabilities also show up as used. */
+	size_t num_found = num_other_used;
+	/* Walk the whole tree from the top-level table in the header. */
+	ecode = check_hash_tree(ntdb, offsetof(struct ntdb_header, hashtable),
+				NTDB_TOPLEVEL_HASH_BITS-NTDB_HASH_GROUP_BITS,
+				0, 0, used, num_used, &num_found,
+				check, data);
+	if (ecode != NTDB_SUCCESS)
+		return ecode;
+	if (num_found == num_used)
+		return NTDB_SUCCESS;
+	/* Some record in used[] was never reached from the hash. */
+	return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+			  "ntdb_check: Not all entries"
+			  " are in hash");
+}
+
+static enum NTDB_ERROR check_free(struct ntdb_context *ntdb,
+				 ntdb_off_t off,
+				 const struct ntdb_free_record *frec,
+				 ntdb_off_t prev, unsigned int ftable,
+				 unsigned int bucket)
+{
+	enum NTDB_ERROR ecode;
+	/* Validate frec (at off); prev == 0 skips the back-pointer check. */
+	if (frec_magic(frec) != NTDB_FREE_MAGIC) {
+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+				  "ntdb_check: offset %llu bad magic 0x%llx",
+				  (long long)off,
+				  (long long)frec->magic_and_prev);
+	}
+	if (frec_ftable(frec) != ftable) {
+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+				  "ntdb_check: offset %llu bad freetable %u",
+				  (long long)off, frec_ftable(frec));
+	}
+
+	/* Bounds-check the whole record: used-record header plus free length. */
+	ecode = ntdb->io->oob(ntdb, off,
+			     frec_len(frec)
+			     + sizeof(struct ntdb_used_record),
+			     false);
+	if (ecode != NTDB_SUCCESS) {
+		return ecode;
+	}
+	if (size_to_bucket(frec_len(frec)) != bucket) {
+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+				  "ntdb_check: offset %llu in wrong bucket"
+				  " (%u vs %u)",
+				  (long long)off,
+				  bucket, size_to_bucket(frec_len(frec)));
+	}
+	if (prev && prev != frec_prev(frec)) {
+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+				  "ntdb_check: offset %llu bad prev"
+				  " (%llu vs %llu)",
+				  (long long)off,
+				  (long long)prev, (long long)frec_prev(frec));
+	}
+	return NTDB_SUCCESS;
+}
+
+static enum NTDB_ERROR check_free_table(struct ntdb_context *ntdb,
+				       ntdb_off_t ftable_off,
+				       unsigned ftable_num,
+				       ntdb_off_t fr[],
+				       size_t num_free,
+				       size_t *num_found)
+{
+	struct ntdb_freetable ft;
+	ntdb_off_t h;
+	unsigned int i;
+	enum NTDB_ERROR ecode;
+	/* Check one free table: its header, then every bucket's free list. */
+	ecode = ntdb_read_convert(ntdb, ftable_off, &ft, sizeof(ft));
+	if (ecode != NTDB_SUCCESS) {
+		return ecode;
+	}
+
+	if (rec_magic(&ft.hdr) != NTDB_FTABLE_MAGIC
+	    || rec_key_length(&ft.hdr) != 0
+	    || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)
+	    || rec_hash(&ft.hdr) != 0) {
+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+				  "ntdb_check: Invalid header on free table");
+	}
+
+	for (i = 0; i < NTDB_FREE_BUCKETS; i++) {
+		ntdb_off_t off, prev = 0, *p, first = 0;
+		struct ntdb_free_record f;
+		/* Walk this bucket's list; prev trails one record behind off. */
+		h = bucket_off(ftable_off, i);
+		for (off = ntdb_read_off(ntdb, h); off; off = f.next) {
+			if (NTDB_OFF_IS_ERR(off)) {
+				return NTDB_OFF_TO_ERR(off);
+			}
+			if (!first) {
+				off &= NTDB_OFF_MASK;
+				first = off;
+			}
+			ecode = ntdb_read_convert(ntdb, off, &f, sizeof(f));
+			if (ecode != NTDB_SUCCESS) {
+				return ecode;
+			}
+			ecode = check_free(ntdb, off, &f, prev, ftable_num, i);
+			if (ecode != NTDB_SUCCESS) {
+				return ecode;
+			}
+
+			/* FIXME: Check hash bits */
+			p = asearch(&off, fr, num_free, off_cmp);
+			if (!p) {
+				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
+						  NTDB_LOG_ERROR,
+						  "ntdb_check: Invalid offset"
+						  " %llu in free table",
+						  (long long)off);
+			}
+			/* Mark it invalid. */
+			*p ^= 1;
+			(*num_found)++;
+			prev = off;
+		}
+
+		if (first) {
+			/* Now we can check first back pointer. */
+			ecode = ntdb_read_convert(ntdb, first, &f, sizeof(f));
+			if (ecode != NTDB_SUCCESS) {
+				return ecode;
+			}
+			ecode = check_free(ntdb, first, &f, prev, ftable_num, i);
+			if (ecode != NTDB_SUCCESS) {
+				return ecode;
+			}
+		}
+	}
+	return NTDB_SUCCESS;
+}
+
+/* Length of the 0 / 0x43 fill run at off. Slow, but should be very rare. */
+ntdb_off_t dead_space(struct ntdb_context *ntdb, ntdb_off_t off)
+{
+	size_t len;
+	enum NTDB_ERROR ecode;
+	/* Read byte by byte at off + len so the scan actually advances. */
+	for (len = 0; off + len < ntdb->file->map_size; len++) {
+		char c;
+		ecode = ntdb->io->tread(ntdb, off + len, &c, 1);
+		if (ecode != NTDB_SUCCESS) {
+			return NTDB_ERR_TO_OFF(ecode);
+		}
+		if (c != 0 && c != 0x43)
+			break;
+	}
+	return len;
+}
+
+static enum NTDB_ERROR check_linear(struct ntdb_context *ntdb,
+				   ntdb_off_t **used, size_t *num_used,
+				   ntdb_off_t **fr, size_t *num_free,
+				   uint64_t features, ntdb_off_t recovery)
+{
+	ntdb_off_t off;
+	ntdb_len_t len;
+	enum NTDB_ERROR ecode;
+	bool found_recovery = false;
+	/* Scan records back to back from just after the header to EOF. */
+	for (off = sizeof(struct ntdb_header);
+	     off < ntdb->file->map_size;
+	     off += len) {
+		union {
+			struct ntdb_used_record u;
+			struct ntdb_free_record f;
+			struct ntdb_recovery_record r;
+		} rec;
+		/* r is larger: only get that if we need to. */
+		ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec.f));
+		if (ecode != NTDB_SUCCESS) {
+			return ecode;
+		}
+
+		/* If we crash after ftruncate, we can get zeroes or fill. */
+		if (rec.r.magic == NTDB_RECOVERY_INVALID_MAGIC
+		    || rec.r.magic ==  0x4343434343434343ULL) {
+			ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec.r));
+			if (ecode != NTDB_SUCCESS) {
+				return ecode;
+			}
+			if (recovery == off) {
+				found_recovery = true;
+				len = sizeof(rec.r) + rec.r.max_len;
+			} else {
+				len = dead_space(ntdb, off);
+				if (NTDB_OFF_IS_ERR(len)) {
+					return NTDB_OFF_TO_ERR(len);
+				}
+				if (len < sizeof(rec.r)) {
+					return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
+							  NTDB_LOG_ERROR,
+							  "ntdb_check: invalid"
+							  " dead space at %zu",
+							  (size_t)off);
+				}
+
+				ntdb_logerr(ntdb, NTDB_SUCCESS, NTDB_LOG_WARNING,
+					   "Dead space at %zu-%zu (of %zu)",
+					   (size_t)off, (size_t)(off + len),
+					   (size_t)ntdb->file->map_size);
+			}
+		} else if (rec.r.magic == NTDB_RECOVERY_MAGIC) {
+			ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec.r));
+			if (ecode != NTDB_SUCCESS) {
+				return ecode;
+			}
+			if (recovery != off) {
+				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
+						  NTDB_LOG_ERROR,
+						  "ntdb_check: unexpected"
+						  " recovery record at offset"
+						  " %zu",
+						  (size_t)off);
+			}
+			if (rec.r.len > rec.r.max_len) {
+				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
+						  NTDB_LOG_ERROR,
+						  "ntdb_check: invalid recovery"
+						  " length %zu",
+						  (size_t)rec.r.len);
+			}
+			if (rec.r.eof > ntdb->file->map_size) {
+				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
+						  NTDB_LOG_ERROR,
+						  "ntdb_check: invalid old EOF"
+						  " %zu", (size_t)rec.r.eof);
+			}
+			found_recovery = true;
+			len = sizeof(rec.r) + rec.r.max_len;
+		} else if (frec_magic(&rec.f) == NTDB_FREE_MAGIC) {
+			len = sizeof(rec.u) + frec_len(&rec.f);
+			if (off + len > ntdb->file->map_size) {
+				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
+						  NTDB_LOG_ERROR,
+						  "ntdb_check: free overlength"
+						  " %llu at offset %llu",
+						  (long long)len,
+						  (long long)off);
+			}
+			/* This record should be in free lists. */
+			if (frec_ftable(&rec.f) != NTDB_FTABLE_NONE
+			    && !append(fr, num_free, off)) {
+				return ntdb_logerr(ntdb, NTDB_ERR_OOM,
+						  NTDB_LOG_ERROR,
+						  "ntdb_check: tracking %zu'th"
+						  " free record.", *num_free);
+			}
+		} else if (rec_magic(&rec.u) == NTDB_USED_MAGIC
+			   || rec_magic(&rec.u) == NTDB_CHAIN_MAGIC
+			   || rec_magic(&rec.u) == NTDB_HTABLE_MAGIC
+			   || rec_magic(&rec.u) == NTDB_FTABLE_MAGIC
+			   || rec_magic(&rec.u) == NTDB_CAP_MAGIC) {
+			uint64_t klen, dlen, extra;
+
+			/* This record is used! */
+			if (!append(used, num_used, off)) {
+				return ntdb_logerr(ntdb, NTDB_ERR_OOM,
+						  NTDB_LOG_ERROR,
+						  "ntdb_check: tracking %zu'th"
+						  " used record.", *num_used);
+			}
+
+			klen = rec_key_length(&rec.u);
+			dlen = rec_data_length(&rec.u);
+			extra = rec_extra_padding(&rec.u);
+			/* Total footprint: header, key, data and padding. */
+			len = sizeof(rec.u) + klen + dlen + extra;
+			if (off + len > ntdb->file->map_size) {
+				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
+						  NTDB_LOG_ERROR,
+						  "ntdb_check: used overlength"
+						  " %llu at offset %llu",
+						  (long long)len,
+						  (long long)off);
+			}
+
+			if (len < sizeof(rec.f)) {
+				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
+						  NTDB_LOG_ERROR,
+						  "ntdb_check: too short record"
+						  " %llu at %llu",
+						  (long long)len,
+						  (long long)off);
+			}
+
+			/* Check that records have correct 0 at end (but may
+			 * not in future). */
+			if (extra && !features
+			    && rec_magic(&rec.u) != NTDB_CAP_MAGIC) {
+				const char *p;
+				char c;
+				p = ntdb_access_read(ntdb, off + sizeof(rec.u)
+						    + klen + dlen, 1, false);
+				if (NTDB_PTR_IS_ERR(p))
+					return NTDB_PTR_ERR(p);
+				c = *p;
+				ntdb_access_release(ntdb, p);
+
+				if (c != '\0') {
+					return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
+							  NTDB_LOG_ERROR,
+							  "ntdb_check:"
+							  " non-zero extra"
+							  " at %llu",
+							  (long long)off);
+				}
+			}
+		} else {
+			return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
+					  NTDB_LOG_ERROR,
+					  "ntdb_check: Bad magic 0x%llx"
+					  " at offset %zu",
+					  (long long)rec_magic(&rec.u),
+					  (size_t)off);
+		}
+	}
+
+	/* We must have found recovery area if there was one. */
+	if (recovery != 0 && !found_recovery) {
+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+				  "ntdb_check: expected a recovery area at %zu",
+				  (size_t)recovery);
+	}
+
+	return NTDB_SUCCESS;
+}
+
+_PUBLIC_ enum NTDB_ERROR ntdb_check_(struct ntdb_context *ntdb,
+			  enum NTDB_ERROR (*check)(NTDB_DATA, NTDB_DATA, void *),
+			  void *data)
+{
+	ntdb_off_t *fr = NULL, *used = NULL, ft, recovery;
+	size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0,
+		num_capabilities = 0;
+	uint64_t features;
+	enum NTDB_ERROR ecode;
+	/* Full consistency check of the database, done under read locks. */
+	if (ntdb->flags & NTDB_CANT_CHECK) {
+		return ntdb_logerr(ntdb, NTDB_SUCCESS, NTDB_LOG_WARNING,
+				  "ntdb_check: database has unknown capability,"
+				  " cannot check.");
+	}
+
+	ecode = ntdb_allrecord_lock(ntdb, F_RDLCK, NTDB_LOCK_WAIT, false);
+	if (ecode != NTDB_SUCCESS) {
+		return ntdb->last_error = ecode;
+	}
+
+	ecode = ntdb_lock_expand(ntdb, F_RDLCK);
+	if (ecode != NTDB_SUCCESS) {
+		ntdb_allrecord_unlock(ntdb, F_RDLCK);
+		return ntdb->last_error = ecode;
+	}
+
+	ecode = check_header(ntdb, &recovery, &features, &num_capabilities);
+	if (ecode != NTDB_SUCCESS)
+		goto out;
+
+	/* First we do a linear scan, checking all records. */
+	ecode = check_linear(ntdb, &used, &num_used, &fr, &num_free, features,
+			     recovery);
+	if (ecode != NTDB_SUCCESS)
+		goto out;
+	/* Then walk each free table, counting the free records they reach. */
+	for (ft = first_ftable(ntdb); ft; ft = next_ftable(ntdb, ft)) {
+		if (NTDB_OFF_IS_ERR(ft)) {
+			ecode = NTDB_OFF_TO_ERR(ft);
+			goto out;
+		}
+		ecode = check_free_table(ntdb, ft, num_ftables, fr, num_free,
+					 &num_found);
+		if (ecode != NTDB_SUCCESS)
+			goto out;
+		num_ftables++;
+	}
+
+	/* FIXME: Check key uniqueness? */
+	ecode = check_hash(ntdb, used, num_used, num_ftables + num_capabilities,
+			   check, data);
+	if (ecode != NTDB_SUCCESS)
+		goto out;
+	/* num_found counted free-list entries: every free record must be listed. */
+	if (num_found != num_free) {
+		ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+				   "ntdb_check: Not all entries are in"
+				   " free table");
+	}
+
+out:
+	ntdb_allrecord_unlock(ntdb, F_RDLCK);
+	ntdb_unlock_expand(ntdb, F_RDLCK);
+	free(fr);
+	free(used);
+	return ntdb->last_error = ecode;
+}
diff --git a/lib/ntdb/configure b/lib/ntdb/configure
new file mode 100755
index 0000000000..6a9f875511
--- /dev/null
+++ b/lib/ntdb/configure
@@ -0,0 +1,21 @@
+#!/bin/sh
+# Locate waf (in-tree or bundled) and run "waf configure" with our arguments.
+PREVPATH=`dirname "$0"`
+
+if [ -f "$PREVPATH/../../buildtools/bin/waf" ]; then
+	WAF=../../buildtools/bin/waf
+elif [ -f "$PREVPATH/buildtools/bin/waf" ]; then
+	WAF=./buildtools/bin/waf
+else
+	echo "ntdb: Unable to find waf" >&2
+	exit 1
+fi
+
+# using JOBS=1 gives maximum compatibility with
+# systems like AIX which have broken threading in python
+JOBS=1
+export JOBS
+
+cd . || exit 1
+$WAF configure "$@" || exit 1
+cd "$PREVPATH"
diff --git a/lib/ntdb/doc/TDB_porting.txt b/lib/ntdb/doc/TDB_porting.txt
new file mode 100644
index 0000000000..8df137416d
--- /dev/null
+++ b/lib/ntdb/doc/TDB_porting.txt
@@ -0,0 +1,65 @@
+Interface differences between TDB and NTDB.
+
+- ntdb shares 'struct TDB_DATA' with tdb, but TDB defines the TDB_DATA
+  typedef, whereas ntdb defines NTDB_DATA (ie. both are compatible).
+  If you include both ntdb.h and tdb.h, #include tdb.h first,
+  otherwise you'll get a compile error when tdb.h re-defines struct
+  TDB_DATA.
+
+- ntdb functions return NTDB_SUCCESS (ie 0) on success, and a negative
+  error on failure, whereas tdb functions returned 0 on success, and
+  -1 on failure.  tdb then used tdb_error() to determine the error;
+  this is also supported in ntdb to ease backwards compatibility,
+  though the other form is preferred.
+
+- ntdb's ntdb_fetch() returns an error, tdb's returned the data directly
+  (or tdb_null, and you were supposed to check tdb_error() to find out why).
+
+- ntdb's ntdb_nextkey() frees the old key's dptr, in tdb you needed to do
+  this manually.
+
+- tdb's tdb_open/tdb_open_ex took an explicit hash size.  ntdb's hash table
+  resizes as required.
+
+- ntdb uses a linked list of attribute structures to implement logging and
+  alternate hashes.  tdb used tdb_open_ex, which was not extensible.
+
+- ntdb does locking on read-only databases (ie. O_RDONLY passed to ntdb_open).
+  tdb did not: use the NTDB_NOLOCK flag if you want to suppress locking.
+
+- ntdb's log function is simpler than tdb's log function.  The string is
+  already formatted, and it takes an enum ntdb_log_level (not a
+  tdb_debug_level), which has only three values: NTDB_LOG_ERROR,
+  NTDB_LOG_USE_ERROR and NTDB_LOG_WARNING.
+
+- ntdb provides ntdb_deq() for comparing two NTDB_DATA, and ntdb_mkdata() for
+  creating an NTDB_DATA.
+
+- ntdb's ntdb_name() returns a copy of the name even for NTDB_INTERNAL dbs.
+
+- ntdb does not need tdb_reopen() or tdb_reopen_all().  If you call
+  fork() during certain operations, the child should close the
+  ntdb, or complete the operations before continuing to use the ntdb:
+
+	ntdb_transaction_start(): child must call ntdb_transaction_cancel()
+	ntdb_lockall(): child must call ntdb_unlockall()
+	ntdb_lockall_read(): child must call ntdb_unlockall_read()
+	ntdb_chainlock(): child must call ntdb_chainunlock()
+	ntdb_parse() callback: child must return from ntdb_parse()
+
+- ntdb will not open a non-tdb file, even if O_CREAT is specified.
+
+- There is no ntdb_traverse_read.  For operating on TDB files, you can
+  simulate it by ntdb_add_flag(tdb, NTDB_RDONLY); ntdb_traverse();
+  ntdb_remove_flag(tdb, NTDB_RDONLY).  This may be desirable because
+  traverse on TDB files uses a write lock on the entire database
+  unless it's read-only.
+
+- Failure inside a transaction (such as a lock function failing) does
+  not implicitly cancel the transaction; you still need to call
+  ntdb_transaction_cancel().
+
+- There is no NTDB_CLEAR_IF_FIRST flag; it has severe scalability and
+  API problems.  If necessary, you can emulate this by using the open
+  hook and placing a 1-byte lock at offset 4.  If your program forks,
+  you will need to place this lock again in the child.
diff --git a/lib/ntdb/doc/design-1.3.txt b/lib/ntdb/doc/design-1.3.txt
new file mode 100644
index 0000000000..f81ecf7885
--- /dev/null
+++ b/lib/ntdb/doc/design-1.3.txt
@@ -0,0 +1,1049 @@
+TDB2: Redesigning The Trivial DataBase
+
+Rusty Russell, IBM Corporation
+
+27-April-2010
+
+Abstract
+
+The Trivial DataBase on-disk format is 32 bits; with usage cases
+heading towards the 4G limit, that must change. This required
+breakage provides an opportunity to revisit TDB's other design
+decisions and reassess them.
+
+1 Introduction
+
+The Trivial DataBase was originally written by Andrew Tridgell as
+a simple key/data pair storage system with the same API as dbm,
+but allowing multiple readers and writers while being small
+enough (< 1000 lines of C) to include in SAMBA. The simple design
+created in 1999 has proven surprisingly robust and performant,
+used in Samba versions 3 and 4 as well as numerous other
+projects. Its useful life was greatly increased by the
+(backwards-compatible!) addition of transaction support in 2005.
+
+The wider variety and greater demands of TDB-using code has led
+to some organic growth of the API, as well as some compromises on
+the implementation. None of these, by themselves, are seen as
+show-stoppers, but the cumulative effect is a loss of elegance
+over the initial, simple TDB implementation. Here is a table of
+the approximate number of lines of implementation code and number
+of API functions at the end of each year:
+
+
++-----------+----------------+--------------------------------+
+| Year End  | API Functions  | Lines of C Code Implementation |
++-----------+----------------+--------------------------------+
++-----------+----------------+--------------------------------+
+|   1999    |      13        |              1195              |
++-----------+----------------+--------------------------------+
+|   2000    |      24        |              1725              |
++-----------+----------------+--------------------------------+
+|   2001    |      32        |              2228              |
++-----------+----------------+--------------------------------+
+|   2002    |      35        |              2481              |
++-----------+----------------+--------------------------------+
+|   2003    |      35        |              2552              |
++-----------+----------------+--------------------------------+
+|   2004    |      40        |              2584              |
++-----------+----------------+--------------------------------+
+|   2005    |      38        |              2647              |
++-----------+----------------+--------------------------------+
+|   2006    |      52        |              3754              |
++-----------+----------------+--------------------------------+
+|   2007    |      66        |              4398              |
++-----------+----------------+--------------------------------+
+|   2008    |      71        |              4768              |
++-----------+----------------+--------------------------------+
+|   2009    |      73        |              5715              |
++-----------+----------------+--------------------------------+
+
+
+This review is an attempt to catalog and address all the known
+issues with TDB and create solutions which address the problems
+without significantly increasing complexity; all involved are far
+too aware of the dangers of second system syndrome in rewriting a
+successful project like this.
+
+2 API Issues
+
+2.1 tdb_open_ex Is Not Expandable
+
+The tdb_open() call was expanded to tdb_open_ex(), which added an
+optional hashing function and an optional logging function
+argument. Additional arguments to open would require the
+introduction of a tdb_open_ex2 call etc.
+
+2.1.1 Proposed Solution
+
+tdb_open() will take a linked-list of attributes:
+
+enum tdb_attribute {
+
+    TDB_ATTRIBUTE_LOG = 0,
+
+    TDB_ATTRIBUTE_HASH = 1
+
+};
+
+struct tdb_attribute_base {
+
+    enum tdb_attribute attr;
+
+    union tdb_attribute *next;
+
+};
+
+struct tdb_attribute_log {
+
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG
+*/
+
+    tdb_log_func log_fn;
+
+    void *log_private;
+
+};
+
+struct tdb_attribute_hash {
+
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH
+*/
+
+    tdb_hash_func hash_fn;
+
+    void *hash_private;
+
+};
+
+union tdb_attribute {
+
+    struct tdb_attribute_base base;
+
+    struct tdb_attribute_log log;
+
+    struct tdb_attribute_hash hash;
+
+};
+
+This allows future attributes to be added, even if this expands
+the size of the union.
+
+2.2 tdb_traverse Makes Impossible Guarantees
+
+tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions,
+and it was thought that it was important to guarantee that all
+records which exist at the start and end of the traversal would
+be included, and no record would be included twice.
+
+This adds complexity (see [Reliable-Traversal-Adds]) and does not
+work anyway for records which are altered (in particular, those
+which are expanded may be effectively deleted and re-added behind
+the traversal).
+
+2.2.1 <traverse-Proposed-Solution>Proposed Solution
+
+Abandon the guarantee. You will see every record if no changes
+occur during your traversal, otherwise you will see some subset.
+You can prevent changes by using a transaction or the locking
+API.
+
+2.3 Nesting of Transactions Is Fraught
+
+TDB has alternated between allowing nested transactions and not
+allowing them. Various paths in the Samba codebase assume that
+transactions will nest, and in a sense they can: the operation is
+only committed to disk when the outer transaction is committed.
+There are two problems, however:
+
+1. Canceling the inner transaction will cause the outer
+  transaction commit to fail, and will not undo any operations
+  since the inner transaction began. This problem is soluble with
+  some additional internal code.
+
+2. An inner transaction commit can be cancelled by the outer
+  transaction. This is desirable in the way which Samba's
+  database initialization code uses transactions, but could be a
+  surprise to any users expecting a successful transaction commit
+  to expose changes to others.
+
+The current solution is to specify the behavior at tdb_open(),
+with the default currently that nested transactions are allowed.
+This flag can also be changed at runtime.
+
+2.3.1 Proposed Solution
+
+Given the usage patterns, it seems that the “least-surprise”
+behavior of disallowing nested transactions should become the
+default. Additionally, it seems the outer transaction is the only
+code which knows whether inner transactions should be allowed, so
+a flag to indicate this could be added to tdb_transaction_start.
+However, this behavior can be simulated with a wrapper which uses
+tdb_add_flags() and tdb_remove_flags(), so the API should not be
+expanded for this relatively-obscure case.
+
+2.4 Incorrect Hash Function is Not Detected
+
+tdb_open_ex() allows the calling code to specify a different hash
+function to use, but does not check that all other processes
+accessing this tdb are using the same hash function. The result
+is that records are missing from tdb_fetch().
+
+2.4.1 Proposed Solution
+
+The header should contain an example hash result (eg. the hash of
+0xdeadbeef), and tdb_open_ex() should check that the given hash
+function produces the same answer, or fail the tdb_open call.
+
+2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
+
+In response to scalability issues with the free list ([TDB-Freelist-Is]
+) two API workarounds have been incorporated in TDB:
+tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open. The
+latter actually calls the former with an argument of “5”.
+
+This code allows deleted records to accumulate without putting
+them in the free list. On delete we iterate through each chain
+and free them in a batch if there are more than max_dead entries.
+These are never otherwise recycled except as a side-effect of a
+tdb_repack.
+
+2.5.1 Proposed Solution
+
+With the scalability problems of the freelist solved, this API
+can be removed. The TDB_VOLATILE flag may still be useful as a
+hint that store and delete of records will be at least as common
+as fetch in order to allow some internal tuning, but initially
+will become a no-op.
+
+2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times
+  In The Same Process
+
+No process can open the same TDB twice; we check and disallow it.
+This is an unfortunate side-effect of fcntl locks, which operate
+on a per-file rather than per-file-descriptor basis, and do not
+nest. Thus, closing any file descriptor on a file clears all the
+locks obtained by this process, even if they were placed using a
+different file descriptor!
+
+Note that even if this were solved, deadlock could occur if
+operations were nested: this is a more manageable programming
+error in most cases.
+
+2.6.1 Proposed Solution
+
+We could lobby POSIX to fix the perverse rules, or at least lobby
+Linux to violate them so that the most common implementation does
+not have this restriction. This would be a generally good idea
+for other fcntl lock users.
+
+Samba uses a wrapper which hands out the same tdb_context to
+multiple callers if this happens, and does simple reference
+counting. We should do this inside the tdb library, which already
+emulates lock nesting internally; it would need to recognize when
+deadlock occurs within a single process. This would create a new
+failure mode for tdb operations (while we currently handle
+locking failures, they are impossible in normal use and a process
+encountering them can do little but give up).
+
+I do not see benefit in an additional tdb_open flag to indicate
+whether re-opening is allowed, although there may be some
+benefit to adding a call to detect when a tdb_context is shared,
+to allow others to create such an API.
+
+2.7 TDB API Is Not POSIX Thread-safe
+
+The TDB API uses an error code which can be queried after an
+operation to determine what went wrong. This programming model
+does not work with threads, unless specific additional guarantees
+are given by the implementation. In addition, even
+otherwise-independent threads cannot open the same TDB (as in [TDB-Files-Cannot]
+).
+
+2.7.1 Proposed Solution
+
+Rearchitecting the API to include a tdb_errcode pointer would be a
+great deal of churn; we are better to guarantee that the
+tdb_errcode is per-thread so the current programming model can be
+maintained.
+
+This requires dynamic per-thread allocations, which is awkward
+with POSIX threads (pthread_key_create space is limited and we
+cannot simply allocate a key for every TDB).
+
+Internal locking is required to make sure that fcntl locks do not
+overlap between threads, and also that the global list of tdbs is
+maintained.
+
+The aim is that building tdb with -DTDB_PTHREAD will result in a
+pthread-safe version of the library, and otherwise no overhead
+will exist.
+
+2.8 *_nonblock Functions And *_mark Functions Expose
+  Implementation
+
+CTDB[footnote:
+Clustered TDB, see http://ctdb.samba.org
+] wishes to operate on TDB in a non-blocking manner. This is
+currently done as follows:
+
+1. Call the _nonblock variant of an API function (eg.
+  tdb_lockall_nonblock). If this fails:
+
+2. Fork a child process, and wait for it to call the normal
+  variant (eg. tdb_lockall).
+
+3. If the child succeeds, call the _mark variant to indicate we
+  already have the locks (eg. tdb_lockall_mark).
+
+4. Upon completion, tell the child to release the locks (eg.
+  tdb_unlockall).
+
+5. Indicate to tdb that it should consider the locks removed (eg.
+  tdb_unlockall_mark).
+
+There are several issues with this approach. Firstly, adding two
+new variants of each function clutters the API for an obscure
+use, and so not all functions have three variants. Secondly, it
+assumes that all paths of the functions ask for the same locks,
+otherwise the parent process will have to get a lock which the
+child doesn't have under some circumstances. I don't believe this
+is currently the case, but it constrains the implementation.
+
+2.8.1 <Proposed-Solution-locking-hook>Proposed Solution
+
+Implement a hook for locking methods, so that the caller can
+control the calls to create and remove fcntl locks. In this
+scenario, ctdbd would operate as follows:
+
+1. Call the normal API function, eg tdb_lockall().
+
+2. When the lock callback comes in, check if the child has the
+  lock. Initially, this is always false. If so, return 0.
+  Otherwise, try to obtain it in non-blocking mode. If that
+  fails, return EWOULDBLOCK.
+
+3. Release locks in the unlock callback as normal.
+
+4. If tdb_lockall() fails, see if we recorded a lock failure; if
+  so, call the child to repeat the operation.
+
+5. The child records what locks it obtains, and returns that
+  information to the parent.
+
+6. When the child has succeeded, goto 1.
+
+This is flexible enough to handle any potential locking scenario,
+even when lock requirements change. It can be optimized so that
+the parent does not release locks, just tells the child which
+locks it doesn't need to obtain.
+
+It also keeps the complexity out of the API, and in ctdbd where
+it is needed.
+
+2.9 tdb_chainlock Functions Expose Implementation
+
+tdb_chainlock locks some number of records, including the record
+indicated by the given key. This gave atomicity guarantees;
+no-one can start a transaction, alter, read or delete that key
+while the lock is held.
+
+It also makes the same guarantee for any other key in the chain,
+which is an internal implementation detail and potentially a
+cause for deadlock.
+
+2.9.1 Proposed Solution
+
+None. It would be nice to have an explicit single entry lock
+which affected no other keys. Unfortunately, this won't work for
+an entry which doesn't exist. Thus while chainlock may be
+implemented more efficiently for the existing case, it will still
+have overlap issues with the non-existing case. So it is best to
+keep the current (lack of) guarantee about which records will be
+affected to avoid constraining our implementation.
+
+2.10 Signal Handling is Not Race-Free
+
+The tdb_setalarm_sigptr() call allows the caller's signal handler
+to indicate that the tdb locking code should return with a
+failure, rather than trying again when a signal is received (and
+errno == EAGAIN). This is usually used to implement timeouts.
+
+Unfortunately, this does not work in the case where the signal is
+received before the tdb code enters the fcntl() call to place the
+lock: the code will sleep within the fcntl() code, unaware that
+the signal wants it to exit. In the case of long timeouts, this
+does not happen in practice.
+
+2.10.1 Proposed Solution
+
+The locking hooks proposed in [Proposed-Solution-locking-hook]
+would allow the user to decide on whether to fail the lock
+acquisition on a signal. This allows the caller to choose their
+own compromise: they could narrow the race by checking
+immediately before the fcntl call.[footnote:
+It may be possible to make this race-free in some implementations
+by having the signal handler alter the struct flock to make it
+invalid. This will cause the fcntl() lock call to fail with
+EINVAL if the signal occurs before the kernel is entered,
+otherwise EAGAIN.
+]
+
+2.11 The API Uses Gratuitous Typedefs, Capitals
+
+typedefs are useful for providing source compatibility when types
+can differ across implementations, or arguably in the case of
+function pointer definitions which are hard for humans to parse.
+Otherwise it is simply obfuscation and pollutes the namespace.
+
+Capitalization is usually reserved for compile-time constants and
+macros.
+
+  TDB_CONTEXT There is no reason to use this over 'struct
+  tdb_context'; the definition isn't visible to the API user
+  anyway.
+
+  TDB_DATA There is no reason to use this over struct TDB_DATA;
+  the struct needs to be understood by the API user.
+
+  struct TDB_DATA This would normally be called 'struct
+  tdb_data'.
+
+  enum TDB_ERROR Similarly, this would normally be enum
+  tdb_error.
+
+2.11.1 Proposed Solution
+
+None. Introducing lower case variants would please pedants like
+myself, but if it were done the existing ones should be kept.
+There is little point forcing a purely cosmetic change upon tdb
+users.
+
+2.12 <tdb_log_func-Doesnt-Take>tdb_log_func Doesn't Take The
+  Private Pointer
+
+For API compatibility reasons, the logging function needs to call
+tdb_get_logging_private() to retrieve the pointer registered by
+the tdb_open_ex for logging.
+
+2.12.1 Proposed Solution
+
+It should simply take an extra argument, since we are prepared to
+break the API/ABI.
+
+2.13 Various Callback Functions Are Not Typesafe
+
+The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take]
+ is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read
+and tdb_check all take void * and must internally convert it to
+the argument type they were expecting.
+
+If this type changes, the compiler will not produce warnings on
+the callers, since it only sees void *.
+
+2.13.1 Proposed Solution
+
+With careful use of macros, we can create callback functions
+which give a warning when used on gcc and the types of the
+callback and its private argument differ. Unsupported compilers
+will not give a warning, which is no worse than now. In addition,
+the callbacks become clearer, as they need not use void * for
+their parameter.
+
+See CCAN's typesafe_cb module at
+http://ccan.ozlabs.org/info/typesafe_cb.html
+
+2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens,
+  tdb_reopen_all Problematic
+
+The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB
+file should be cleared if the caller discovers it is the only
+process with the TDB open. However, if any caller does not
+specify TDB_CLEAR_IF_FIRST it will not be detected, so will have
+the TDB erased underneath them (usually resulting in a crash).
+
+There is a similar issue on fork(); if the parent exits (or
+otherwise closes the tdb) before the child calls tdb_reopen_all()
+to establish the lock used to indicate the TDB is opened by
+someone, a TDB_CLEAR_IF_FIRST opener at that moment will believe
+it alone has opened the TDB and will erase it.
+
+2.14.1 Proposed Solution
+
+Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but
+see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
+
+3 Performance And Scalability Issues
+
+3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST
+  Imposes Performance Penalty
+
+When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is
+placed at offset 4 (aka. the ACTIVE_LOCK). While these locks
+never conflict in normal tdb usage, they do add substantial
+overhead for most fcntl lock implementations when the kernel
+scans to detect if a lock conflict exists. This is often a single
+linked list, making the time to acquire and release a fcntl lock
+O(N) where N is the number of processes with the TDB open, not
+the number actually doing work.
+
+In a Samba server it is common to have huge numbers of clients
+sitting idle, and thus they have weaned themselves off the
+TDB_CLEAR_IF_FIRST flag.[footnote:
+There is a flag to tdb_reopen_all() which is used for this
+optimization: if the parent process will outlive the child, the
+child does not need the ACTIVE_LOCK. This is a workaround for
+this very performance issue.
+]
+
+3.1.1 Proposed Solution
+
+Remove the flag. It was a neat idea, but even trivial servers
+tend to know when they are initializing for the first time and
+can simply unlink the old tdb at that point.
+
+3.2 TDB Files Have a 4G Limit
+
+This seems to be becoming an issue (so much for “trivial”!),
+particularly for ldb.
+
+3.2.1 Proposed Solution
+
+A new, incompatible TDB format which uses 64 bit offsets
+internally rather than 32 bit as now. For simplicity of endian
+conversion (which TDB does on the fly if required), all values
+will be 64 bit on disk. In practice, some upper bits may be used
+for other purposes, but at least 56 bits will be available for
+file offsets.
+
+tdb_open() will automatically detect the old version, and even
+create them if TDB_VERSION6 is specified to tdb_open.
+
+32 bit processes will still be able to access TDBs larger than 4G
+(assuming that their off_t allows them to seek to 64 bits), they
+will gracefully fall back as they fail to mmap. This can happen
+already with large TDBs.
+
+Old versions of tdb will fail to open the new TDB files (since 28
+August 2009, commit 398d0c29290: prior to that any unrecognized
+file format would be erased and initialized as a fresh tdb!)
+
+3.3 TDB Records Have a 4G Limit
+
+This has not been a reported problem, and the API uses size_t
+which can be 64 bit on 64 bit platforms. However, other limits
+may have made such an issue moot.
+
+3.3.1 Proposed Solution
+
+Record sizes will be 64 bit, with an error returned on 32 bit
+platforms which try to access such records (the current
+implementation would return TDB_ERR_OOM in a similar case). It
+seems unlikely that 32 bit keys will be a limitation, so the
+implementation may not support this (see [sub:Records-Incur-A]).
+
+3.4 Hash Size Is Determined At TDB Creation Time
+
+TDB contains a number of hash chains in the header; the number is
+specified at creation time, and defaults to 131. This is such a
+bottleneck on large databases (as each hash chain gets quite
+long), that LDB uses 10,000 for this hash. In general it is
+impossible to know what the 'right' answer is at database
+creation time.
+
+3.4.1 Proposed Solution
+
+After comprehensive performance testing on various scalable hash
+variants[footnote:
+http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94
+This was annoying because I was previously convinced that an
+expanding tree of hashes would be very close to optimal.
+], it became clear that it is hard to beat a straight linear hash
+table which doubles in size when it reaches saturation. There are
+three details which become important:
+
+1. On encountering a full bucket, we use the next bucket.
+
+2. Extra hash bits are stored with the offset, to reduce
+  comparisons.
+
+3. A marker entry is used on deleting an entry.
+
+The doubling of the table must be done under a transaction; we
+will not reduce it on deletion, so it will be an unusual case. It
+will either be placed at the head (other entries will be moved
+out the way so we can expand). We could have a pointer in the
+header to the current hashtable location, but that pointer would
+have to be read frequently to check for hashtable moves.
+
+The locking for this is slightly more complex than the chained
+case; we currently have one lock per bucket, and that means we
+would need to expand the lock if we overflow to the next bucket.
+The frequency of such collisions will affect our locking
+heuristics: we can always lock more buckets than we need.
+
+One possible optimization is to only re-check the hash size on an
+insert or a lookup miss.
+
+3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
+
+TDB uses a single linked list for the free list. Allocation
+occurs as follows, using heuristics which have evolved over time:
+
+1. Get the free list lock for this whole operation.
+
+2. Multiply length by 1.25, so we always over-allocate by 25%.
+
+3. Set the slack multiplier to 1.
+
+4. Examine the current freelist entry: if it is > length but <
+  the current best case, remember it as the best case.
+
+5. Multiply the slack multiplier by 1.05.
+
+6. If our best fit so far is less than length * slack multiplier,
+  return it. The slack will be turned into a new free record if
+  it's large enough.
+
+7. Otherwise, go onto the next freelist entry.
+
+Deleting a record occurs as follows:
+
+1. Lock the hash chain for this whole operation.
+
+2. Walk the chain to find the record, keeping the prev pointer
+  offset.
+
+3. If max_dead is non-zero:
+
+  (a) Walk the hash chain again and count the dead records.
+
+  (b) If it's more than max_dead, bulk free all the dead ones
+    (similar to steps 4 and below, but the lock is only obtained
+    once).
+
+  (c) Simply mark this record as dead and return.
+
+4. Get the free list lock for the remainder of this operation.
+
+5. <right-merging>Examine the following block to see if it is
+  free; if so, enlarge the current block and remove that block
+  from the free list. This was disabled, as removal from the free
+  list was O(entries-in-free-list).
+
+6. Examine the preceding block to see if it is free: for this
+  reason, each block has a 32-bit tailer which indicates its
+  length. If it is free, expand it to cover our new block and
+  return.
+
+7. Otherwise, prepend ourselves to the free list.
+
+Disabling right-merging (step [right-merging]) causes
+fragmentation; the other heuristics proved insufficient to
+address this, so the final answer to this was that when we expand
+the TDB file inside a transaction commit, we repack the entire
+tdb.
+
+The single list lock limits our allocation rate; due to the other
+issues this is not currently seen as a bottleneck.
+
+3.5.1 Proposed Solution
+
+The first step is to remove all the current heuristics, as they
+obviously interact, then examine them once the lock contention is
+addressed.
+
+The free list must be split to reduce contention. Assuming
+perfect free merging, we can at most have 1 free list entry for
+each entry. This implies that the number of free lists is related
+to the size of the hash table, but as it is rare to walk a large
+number of free list entries we can use far fewer, say 1/32 of the
+number of hash buckets.
+
+There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fragmented]
+) but it's not clear this would reduce contention in the common
+case where all processes are allocating/freeing the same size.
+Thus we almost certainly need to divide in other ways: the most
+obvious is to divide the file into zones, and use a free list
+(or set of free lists) for each. This approximates address
+ordering.
+
+Note that this means we need to split the free lists when we
+expand the file; this is probably acceptable when we double the
+hash table size, since that is such an expensive operation
+already. In the case of increasing the file size, there is an
+optimization we can use: if we use M in the formula above as the
+file size rounded up to the next power of 2, we only need
+reshuffle free lists when the file size crosses a power of 2
+boundary, and reshuffling the free lists is trivial: we simply
+merge every consecutive pair of free lists.
+
+The basic algorithm is as follows. Freeing is simple:
+
+1. Identify the correct zone.
+
+2. Lock the corresponding list.
+
+3. Re-check the zone (we didn't have a lock, sizes could have
+  changed): relock if necessary.
+
+4. Place the freed entry in the list for that zone.
+
+Allocation is a little more complicated, as we perform delayed
+coalescing at this point:
+
+1. Pick a zone: either the zone we last freed into, or based on a
+  “random” number.
+
+2. Lock the corresponding list.
+
+3. Re-check the zone: relock if necessary.
+
+4. If the top entry is large enough, remove it from the list and
+  return it.
+
+5. Otherwise, coalesce entries in the list.
+
+  (a)
+
+  (b)
+
+  (c)
+
+  (d)
+
+6. If there was no entry large enough, unlock the list and try
+  the next zone.
+
+7.
+
+8.
+
+9. If no zone satisfies, expand the file.
+
+This optimizes rapid insert/delete of free list entries by not
+coalescing them all the time. First-fit address ordering
+seems to be fairly good for keeping fragmentation low
+(see [sub:TDB-Becomes-Fragmented]). Note that address ordering
+does not need a tailer to coalesce, though if we needed one we
+could have one cheaply: see [sub:Records-Incur-A].
+
+
+
+I anticipate that the number of entries in each free zone would
+be small, but it might be worth using one free entry to hold
+pointers to the others for cache efficiency.
+
+3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented
+
+Much of this is a result of allocation strategy[footnote:
+The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995
+ftp://ftp.cs.utexas.edu/pub/garbage/malloc/ismm98.ps
+] and deliberate hobbling of coalescing; internal fragmentation
+(aka overallocation) is deliberately set at 25%, and external
+fragmentation is only cured by the decision to repack the entire
+db when a transaction commit needs to enlarge the file.
+
+3.6.1 Proposed Solution
+
+The 25% overhead on allocation works in practice for ldb because
+indexes tend to expand by one record at a time. This internal
+fragmentation can be resolved by having an “expanded” bit in the
+header to note entries that have previously expanded, and
+allocating more space for them.
+
+There is a spectrum of possible solutions for external
+fragmentation: one is to use a fragmentation-avoiding allocation
+strategy such as best-fit address-order allocator. The other end
+of the spectrum would be to use a bump allocator (very fast and
+simple) and simply repack the file when we reach the end.
+
+There are three problems with efficient fragmentation-avoiding
+allocators: they are non-trivial, they tend to use a single free
+list for each size, and there's no evidence that tdb allocation
+patterns will match those recorded for general allocators (though
+it seems likely).
+
+Thus we don't spend too much effort on external fragmentation; we
+will be no worse than the current code if we need to repack on
+occasion. More effort is spent on reducing freelist contention,
+and reducing overhead.
+
+3.7 <sub:Records-Incur-A>Records Incur A 28-Byte Overhead
+
+Each TDB record has a header as follows:
+
+struct tdb_record {
+
+        tdb_off_t next; /* offset of the next record in the list
+*/
+
+        tdb_len_t rec_len; /* total byte length of record */
+
+        tdb_len_t key_len; /* byte length of key */
+
+        tdb_len_t data_len; /* byte length of data */
+
+        uint32_t full_hash; /* the full 32 bit hash of the key */
+
+        uint32_t magic;   /* try to catch errors */
+
+        /* the following union is implied:
+
+                union {
+
+                        char record[rec_len];
+
+                        struct {
+
+                                char key[key_len];
+
+                                char data[data_len];
+
+                        }
+
+                        uint32_t totalsize; (tailer)
+
+                }
+
+        */
+
+};
+
+Naively, this would double to a 56-byte overhead on a 64 bit
+implementation.
+
+3.7.1 Proposed Solution
+
+We can use various techniques to reduce this for an allocated
+block:
+
+1. The 'next' pointer is not required, as we are using a flat
+  hash table.
+
+2. 'rec_len' can instead be expressed as an addition to key_len
+  and data_len (it accounts for wasted or overallocated length in
+  the record). Since the record length is always a multiple of 8,
+  we can conveniently fit it in 32 bits (representing up to 35
+  bits).
+
+3. 'key_len' and 'data_len' can be reduced. I'm unwilling to
+  restrict 'data_len' to 32 bits, but instead we can combine the
+  two into one 64-bit field and use a 5 bit value which
+  indicates at what bit to divide the two. Keys are unlikely to
+  scale as fast as data, so I'm assuming a maximum key size of 32
+  bits.
+
+4. 'full_hash' is used to avoid a memcmp on the “miss” case, but
+  this is diminishing returns after a handful of bits (at 10
+  bits, it reduces 99.9% of false memcmp). As an aside, as the
+  lower bits are already incorporated in the hash table
+  resolution, the upper bits should be used here.
+
+5. 'magic' does not need to be enlarged: it currently reflects
+  one of 5 values (used, free, dead, recovery, and
+  unused_recovery). It is useful for quick sanity checking
+  however, and should not be eliminated.
+
+6. 'tailer' is only used to coalesce free blocks (so a block to
+  the right can find the header to check if this block is free).
+  This can be replaced by a single 'free' bit in the header of
+  the following block (and the tailer only exists in free
+  blocks).[footnote:
+This technique from Thomas Standish. Data Structure Techniques.
+Addison-Wesley, Reading, Massachusetts, 1980.
+] The current proposed coalescing algorithm doesn't need this,
+  however.
+
+This produces a 16 byte used header like this:
+
+struct tdb_used_record {
+
+        uint32_t magic : 16,
+
+                 prev_is_free: 1,
+
+                 key_data_divide: 5,
+
+                 top_hash: 10;
+
+        uint32_t extra_octets;
+
+        uint64_t key_and_data_len;
+
+};
+
+And a free record like this:
+
+struct tdb_free_record {
+
+        uint32_t free_magic;
+
+        uint64_t total_length;
+
+        ...
+
+        uint64_t tailer;
+
+};
+
+
+
+3.8 Transaction Commit Requires 4 fdatasync
+
+The current transaction algorithm is:
+
+1. write_recovery_data();
+
+2. sync();
+
+3. write_recovery_header();
+
+4. sync();
+
+5. overwrite_with_new_data();
+
+6. sync();
+
+7. remove_recovery_header();
+
+8. sync();
+
+On current ext3, each sync flushes all data to disk, so the next
+3 syncs are relatively inexpensive. But this could become a
+performance bottleneck on other filesystems such as ext4.
+
+3.8.1 Proposed Solution
+
+
+
+
+
+
+
+
+
+Neil Brown points out that this is overzealous, and only one sync
+is needed:
+
+1. Bundle the recovery data, a transaction counter and a strong
+  checksum of the new data.
+
+2. Strong checksum that whole bundle.
+
+3. Store the bundle in the database.
+
+4. Overwrite the oldest of the two recovery pointers in the
+  header (identified using the transaction counter) with the
+  offset of this bundle.
+
+5. sync.
+
+6. Write the new data to the file.
+
+Checking for recovery means identifying the latest bundle with a
+valid checksum and using the new data checksum to ensure that it
+has been applied. This is more expensive than the current check,
+but need only be done at open. For running databases, a separate
+header field can be used to indicate a transaction in progress;
+we need only check for recovery if this is set.
+
+3.9 TDB Does Not Have Snapshot Support
+
+3.9.1 Proposed Solution
+
+None. At some point you say “use a real database”.
+
+But as a thought experiment, if we implemented transactions to
+only overwrite free entries (this is tricky: there must not be a
+header in each entry which indicates whether it is free; instead we
+would rely on presence in metadata elsewhere), and a pointer to the hash
+table, we could create an entirely new commit without destroying
+existing data. Then it would be easy to implement snapshots in a
+similar way.
+
+This would not allow arbitrary changes to the database, such as
+tdb_repack does, and would require more space (since we have to
+preserve the current and future entries at once). If we used hash
+trees rather than one big hash table, we might only have to
+rewrite some sections of the hash, too.
+
+We could then implement snapshots using a similar method, using
+multiple different hash tables/free tables.
+
+3.10 Transactions Cannot Operate in Parallel
+
+This would be useless for ldb, as it hits the index records with
+just about every update. It would add significant complexity in
+resolving clashes, and cause all transaction callers to write
+their code to loop in the case where the transactions spuriously
+failed.
+
+3.10.1 Proposed Solution
+
+We could solve a small part of the problem by providing read-only
+transactions. These would allow one write transaction to begin,
+but it could not commit until all r/o transactions are done. This
+would require a new RO_TRANSACTION_LOCK, which would be upgraded
+on commit.
+
+3.11 Default Hash Function Is Suboptimal
+
+The Knuth-inspired multiplicative hash used by tdb is fairly slow
+(especially if we expand it to 64 bits), and works best when the
+hash bucket size is a prime number (which also means a slow
+modulus). In addition, it is highly predictable which could
+potentially lead to a Denial of Service attack in some TDB uses.
+
+3.11.1 Proposed Solution
+
+The Jenkins lookup3 hash[footnote:
+http://burtleburtle.net/bob/c/lookup3.c
+] is a fast and superbly-mixing hash. It's used by the Linux
+kernel and almost everything else. This has the particular
+properties that it takes an initial seed, and produces two 32 bit
+hash numbers, which we can combine into a 64-bit hash.
+
+The seed should be created at tdb-creation time from some random
+source, and placed in the header. This is far from foolproof, but
+adds a little bit of protection against hash bombing.
+
+3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity
+
+We lock a record during traversal iteration, and try to grab that
+lock in the delete code. If that grab on delete fails, we simply
+mark it deleted and continue onwards; traversal checks for this
+condition and does the delete when it moves off the record.
+
+If traversal terminates, the dead record may be left
+indefinitely.
+
+3.12.1 Proposed Solution
+
+Remove reliability guarantees; see [traverse-Proposed-Solution].
+
+3.13 Fcntl Locking Adds Overhead
+
+Placing a fcntl lock means a system call, as does removing one.
+This is actually one reason why transactions can be faster
+(everything is locked once at transaction start). In the
+uncontended case, this overhead can theoretically be eliminated.
+
+3.13.1 Proposed Solution
+
+None.
+
+We tried this before with spinlock support, in the early days of
+TDB, and it didn't make much difference except in manufactured
+benchmarks.
+
+We could use spinlocks (with futex kernel support under Linux),
+but it means that we lose automatic cleanup when a process dies
+with a lock. There is a method of auto-cleanup under Linux, but
+it's not supported by other operating systems. We could
+reintroduce a clear-if-first-style lock and sweep for dead
+futexes on open, but that wouldn't help the normal case of one
+concurrent opener dying. Increasingly elaborate repair schemes
+could be considered, but they require an ABI change (everyone
+must use them) anyway, so there's no need to do this at the same
+time as everything else.
diff --git a/lib/ntdb/doc/design.lyx b/lib/ntdb/doc/design.lyx
new file mode 100644
index 0000000000..0a1d6a14bc
--- /dev/null
+++ b/lib/ntdb/doc/design.lyx
@@ -0,0 +1,2689 @@
+#LyX 1.6.7 created this file. For more info see http://www.lyx.org/
+\lyxformat 345
+\begin_document
+\begin_header
+\textclass article
+\use_default_options true
+\language english
+\inputencoding auto
+\font_roman default
+\font_sans default
+\font_typewriter default
+\font_default_family default
+\font_sc false
+\font_osf false
+\font_sf_scale 100
+\font_tt_scale 100
+
+\graphics default
+\paperfontsize default
+\use_hyperref false
+\papersize default
+\use_geometry false
+\use_amsmath 1
+\use_esint 1
+\cite_engine basic
+\use_bibtopic false
+\paperorientation portrait
+\secnumdepth 3
+\tocdepth 3
+\paragraph_separation indent
+\defskip medskip
+\quotes_language english
+\papercolumns 1
+\papersides 1
+\paperpagestyle default
+\tracking_changes true
+\output_changes true
+\author ""
+\author ""
+\end_header
+
+\begin_body
+
+\begin_layout Title
+TDB2: Redesigning The Trivial DataBase
+\end_layout
+
+\begin_layout Author
+Rusty Russell, IBM Corporation
+\end_layout
+
+\begin_layout Date
+17-March-2011
+\end_layout
+
+\begin_layout Abstract
+The Trivial DataBase on-disk format is 32 bits; with usage cases heading
+ towards the 4G limit, that must change.
+ This required breakage provides an opportunity to revisit TDB's other design
+ decisions and reassess them.
+\end_layout
+
+\begin_layout Section
+Introduction
+\end_layout
+
+\begin_layout Standard
+The Trivial DataBase was originally written by Andrew Tridgell as a simple
+ key/data pair storage system with the same API as dbm, but allowing multiple
+ readers and writers while being small enough (< 1000 lines of C) to include
+ in SAMBA.
+ The simple design created in 1999 has proven surprisingly robust and performant
+, used in Samba versions 3 and 4 as well as numerous other projects.
+ Its useful life was greatly increased by the (backwards-compatible!) addition
+ of transaction support in 2005.
+\end_layout
+
+\begin_layout Standard
+The wider variety and greater demands of TDB-using code has led to some
+ organic growth of the API, as well as some compromises on the implementation.
+ None of these, by themselves, are seen as show-stoppers, but the cumulative
+ effect is a loss of elegance over the initial, simple TDB implementation.
+ Here is a table of the approximate number of lines of implementation code
+ and number of API functions at the end of each year:
+\end_layout
+
+\begin_layout Standard
+\begin_inset Tabular
+<lyxtabular version="3" rows="12" columns="3">
+<features>
+<column alignment="center" valignment="top" width="0">
+<column alignment="center" valignment="top" width="0">
+<column alignment="center" valignment="top" width="0">
+<row>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Year End
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+API Functions
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Lines of C Code Implementation
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1999
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+13
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1195
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2000
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+24
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1725
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2001
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+32
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2228
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2002
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+35
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2481
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2003
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+35
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2552
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2004
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+40
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2584
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2005
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+38
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2647
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2006
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+52
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+3754
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2007
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+66
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+4398
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2008
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+71
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+4768
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2009
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+73
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+5715
+\end_layout
+
+\end_inset
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+This review is an attempt to catalog and address all the known issues with
+ TDB and create solutions which address the problems without significantly
+ increasing complexity; all involved are far too aware of the dangers of
+ second system syndrome in rewriting a successful project like this.
+\end_layout
+
+\begin_layout Section
+API Issues
+\end_layout
+
+\begin_layout Subsection
+tdb_open_ex Is Not Expandable
+\end_layout
+
+\begin_layout Standard
+The tdb_open() call was expanded to tdb_open_ex(), which added an optional
+ hashing function and an optional logging function argument.
+ Additional arguments to open would require the introduction of a tdb_open_ex2
+ call etc.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\begin_inset CommandInset label
+LatexCommand label
+name "attributes"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+tdb_open() will take a linked-list of attributes:
+\end_layout
+
+\begin_layout LyX-Code
+enum tdb_attribute {
+\end_layout
+
+\begin_layout LyX-Code
+    TDB_ATTRIBUTE_LOG = 0,
+\end_layout
+
+\begin_layout LyX-Code
+    TDB_ATTRIBUTE_HASH = 1
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_base {
+\end_layout
+
+\begin_layout LyX-Code
+    enum tdb_attribute attr;
+\end_layout
+
+\begin_layout LyX-Code
+    union tdb_attribute *next;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_log {
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
+\end_layout
+
+\begin_layout LyX-Code
+    tdb_log_func log_fn;
+\end_layout
+
+\begin_layout LyX-Code
+    void *log_private;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_hash {
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
+\end_layout
+
+\begin_layout LyX-Code
+    tdb_hash_func hash_fn;
+\end_layout
+
+\begin_layout LyX-Code
+    void *hash_private;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+union tdb_attribute {
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_base base;
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_log log;
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_hash hash;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+This allows future attributes to be added, even if this expands the size
+ of the union.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+tdb_traverse Makes Impossible Guarantees
+\end_layout
+
+\begin_layout Standard
+tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, and it
+ was thought that it was important to guarantee that all records which exist
+ at the start and end of the traversal would be included, and no record
+ would be included twice.
+\end_layout
+
+\begin_layout Standard
+This adds complexity (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Reliable-Traversal-Adds"
+
+\end_inset
+
+) and does not work anyway for records which are altered (in particular,
+ those which are expanded may be effectively deleted and re-added behind
+ the traversal).
+\end_layout
+
+\begin_layout Subsubsection
+\begin_inset CommandInset label
+LatexCommand label
+name "traverse-Proposed-Solution"
+
+\end_inset
+
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Abandon the guarantee.
+ You will see every record if no changes occur during your traversal, otherwise
+ you will see some subset.
+ You can prevent changes by using a transaction or the locking API.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+ Delete-during-traverse will still delete every record, too (assuming no
+ other changes).
+\end_layout
+
+\begin_layout Subsection
+Nesting of Transactions Is Fraught
+\end_layout
+
+\begin_layout Standard
+TDB has alternated between allowing nested transactions and not allowing
+ them.
+ Various paths in the Samba codebase assume that transactions will nest,
+ and in a sense they can: the operation is only committed to disk when the
+ outer transaction is committed.
+ There are two problems, however:
+\end_layout
+
+\begin_layout Enumerate
+Canceling the inner transaction will cause the outer transaction commit
+ to fail, and will not undo any operations since the inner transaction began.
+ This problem is soluble with some additional internal code.
+\end_layout
+
+\begin_layout Enumerate
+An inner transaction commit can be cancelled by the outer transaction.
+ This is desirable in the way which Samba's database initialization code
+ uses transactions, but could be a surprise to any users expecting a successful
+ transaction commit to expose changes to others.
+\end_layout
+
+\begin_layout Standard
+The current solution is to specify the behavior at tdb_open(), with the
+ default currently that nested transactions are allowed.
+ This flag can also be changed at runtime.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Given the usage patterns, it seems that the
+\begin_inset Quotes eld
+\end_inset
+
+least-surprise
+\begin_inset Quotes erd
+\end_inset
+
+ behavior of disallowing nested transactions should become the default.
+ Additionally, it seems the outer transaction is the only code which knows
+ whether inner transactions should be allowed, so a flag to indicate this
+ could be added to tdb_transaction_start.
+ However, this behavior can be simulated with a wrapper which uses tdb_add_flags
+() and tdb_remove_flags(), so the API should not be expanded for this relatively
+-obscure case.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete; the nesting flag has been removed.
+\end_layout
+
+\begin_layout Subsection
+Incorrect Hash Function is Not Detected
+\end_layout
+
+\begin_layout Standard
+tdb_open_ex() allows the calling code to specify a different hash function
+ to use, but does not check that all other processes accessing this tdb
+ are using the same hash function.
+ The result is that records are missing from tdb_fetch().
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The header should contain an example hash result (eg.
+ the hash of 0xdeadbeef), and tdb_open_ex() should check that the given
+ hash function produces the same answer, or fail the tdb_open call.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+tdb_set_max_dead/TDB_VOLATILE Expose Implementation
+\end_layout
+
+\begin_layout Standard
+In response to scalability issues with the free list (
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB-Freelist-Is"
+
+\end_inset
+
+) two API workarounds have been incorporated in TDB: tdb_set_max_dead()
+ and the TDB_VOLATILE flag to tdb_open.
+ The latter actually calls the former with an argument of
+\begin_inset Quotes eld
+\end_inset
+
+5
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+This code allows deleted records to accumulate without putting them in the
+ free list.
+ On delete we iterate through each chain and free them in a batch if there
+ are more than max_dead entries.
+ These are never otherwise recycled except as a side-effect of a tdb_repack.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+With the scalability problems of the freelist solved, this API can be removed.
+ The TDB_VOLATILE flag may still be useful as a hint that store and delete
+ of records will be at least as common as fetch in order to allow some internal
+ tuning, but initially will become a no-op.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+ Unknown flags cause tdb_open() to fail as well, so they can be detected
+ at runtime.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB-Files-Cannot"
+
+\end_inset
+
+TDB Files Cannot Be Opened Multiple Times In The Same Process
+\end_layout
+
+\begin_layout Standard
+No process can open the same TDB twice; we check and disallow it.
+ This is an unfortunate side-effect of fcntl locks, which operate on a per-file
+ rather than per-file-descriptor basis, and do not nest.
+ Thus, closing any file descriptor on a file clears all the locks obtained
+ by this process, even if they were placed using a different file descriptor!
+\end_layout
+
+\begin_layout Standard
+Note that even if this were solved, deadlock could occur if operations were
+ nested: this is a more manageable programming error in most cases.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We could lobby POSIX to fix the perverse rules, or at least lobby Linux
+ to violate them so that the most common implementation does not have this
+ restriction.
+ This would be a generally good idea for other fcntl lock users.
+\end_layout
+
+\begin_layout Standard
+Samba uses a wrapper which hands out the same tdb_context to multiple callers
+ if this happens, and does simple reference counting.
+ We should do this inside the tdb library, which already emulates lock nesting
+ internally; it would need to recognize when deadlock occurs within a single
+ process.
+ This would create a new failure mode for tdb operations (while we currently
+ handle locking failures, they are impossible in normal use and a process
+ encountering them can do little but give up).
+\end_layout
+
+\begin_layout Standard
+I do not see benefit in an additional tdb_open flag to indicate whether
+ re-opening is allowed, although there may be some benefit to adding a
+ call to detect when a tdb_context is shared, to allow others to create such
+ an API.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+TDB API Is Not POSIX Thread-safe
+\end_layout
+
+\begin_layout Standard
+The TDB API uses an error code which can be queried after an operation to
+ determine what went wrong.
+ This programming model does not work with threads, unless specific additional
+ guarantees are given by the implementation.
+ In addition, even otherwise-independent threads cannot open the same TDB
+ (as in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB-Files-Cannot"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Rearchitecting the API to include a tdb_errcode pointer would be a great
+ deal of churn, but fortunately most functions return 0 on success and -1
+ on error: we can change these to return 0 on success and a negative error
+ code on error, and the API remains similar to previous.
+ The tdb_fetch, tdb_firstkey and tdb_nextkey functions need to take a TDB_DATA
+ pointer and return an error code.
+ It is also simpler to have tdb_nextkey replace its key argument in place,
+ freeing up any old .dptr.
+\end_layout
+
+\begin_layout Standard
+Internal locking is required to make sure that fcntl locks do not overlap
+ between threads, and also that the global list of tdbs is maintained.
+\end_layout
+
+\begin_layout Standard
+The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
+ version of the library, and otherwise no overhead will exist.
+ Alternatively, a hooking mechanism similar to that proposed for
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Proposed-Solution-locking-hook"
+
+\end_inset
+
+ could be used to enable pthread locking at runtime.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete; API has been changed but thread safety has not been implemented.
+\end_layout
+
+\begin_layout Subsection
+*_nonblock Functions And *_mark Functions Expose Implementation
+\end_layout
+
+\begin_layout Standard
+CTDB
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+Clustered TDB, see http://ctdb.samba.org
+\end_layout
+
+\end_inset
+
+ wishes to operate on TDB in a non-blocking manner.
+ This is currently done as follows:
+\end_layout
+
+\begin_layout Enumerate
+Call the _nonblock variant of an API function (eg.
+ tdb_lockall_nonblock).
+ If this fails:
+\end_layout
+
+\begin_layout Enumerate
+Fork a child process, and wait for it to call the normal variant (eg.
+ tdb_lockall).
+\end_layout
+
+\begin_layout Enumerate
+If the child succeeds, call the _mark variant to indicate we already have
+ the locks (eg.
+ tdb_lockall_mark).
+\end_layout
+
+\begin_layout Enumerate
+Upon completion, tell the child to release the locks (eg.
+ tdb_unlockall).
+\end_layout
+
+\begin_layout Enumerate
+Indicate to tdb that it should consider the locks removed (eg.
+ tdb_unlockall_mark).
+\end_layout
+
+\begin_layout Standard
+There are several issues with this approach.
+ Firstly, adding two new variants of each function clutters the API for
+ an obscure use, and so not all functions have three variants.
+ Secondly, it assumes that all paths of the functions ask for the same locks,
+ otherwise the parent process will have to get a lock which the child doesn't
+ have under some circumstances.
+ I don't believe this is currently the case, but it constrains the implementatio
+n.
+
+\end_layout
+
+\begin_layout Subsubsection
+\begin_inset CommandInset label
+LatexCommand label
+name "Proposed-Solution-locking-hook"
+
+\end_inset
+
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Implement a hook for locking methods, so that the caller can control the
+ calls to create and remove fcntl locks.
+ In this scenario, ctdbd would operate as follows:
+\end_layout
+
+\begin_layout Enumerate
+Call the normal API function, eg tdb_lockall().
+\end_layout
+
+\begin_layout Enumerate
+When the lock callback comes in, check if the child has the lock.
+ Initially, this is always false.
+ If so, return 0.
+ Otherwise, try to obtain it in non-blocking mode.
+ If that fails, return EWOULDBLOCK.
+\end_layout
+
+\begin_layout Enumerate
+Release locks in the unlock callback as normal.
+\end_layout
+
+\begin_layout Enumerate
+If tdb_lockall() fails, see if we recorded a lock failure; if so, call the
+ child to repeat the operation.
+\end_layout
+
+\begin_layout Enumerate
+The child records what locks it obtains, and returns that information to
+ the parent.
+\end_layout
+
+\begin_layout Enumerate
+When the child has succeeded, goto 1.
+\end_layout
+
+\begin_layout Standard
+This is flexible enough to handle any potential locking scenario, even when
+ lock requirements change.
+ It can be optimized so that the parent does not release locks, just tells
+ the child which locks it doesn't need to obtain.
+\end_layout
+
+\begin_layout Standard
+It also keeps the complexity out of the API, and in ctdbd where it is needed.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
+\begin_layout Subsection
+tdb_chainlock Functions Expose Implementation
+\end_layout
+
+\begin_layout Standard
+tdb_chainlock locks some number of records, including the record indicated
+ by the given key.
+ This gave atomicity guarantees; no-one can start a transaction, alter,
+ read or delete that key while the lock is held.
+\end_layout
+
+\begin_layout Standard
+It also makes the same guarantee for any other key in the chain, which is
+ an internal implementation detail and potentially a cause for deadlock.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ It would be nice to have an explicit single entry lock which effected no
+ other keys.
+ Unfortunately, this won't work for an entry which doesn't exist.
+ Thus while chainlock may be implemented more efficiently for the existing
+ case, it will still have overlap issues with the non-existing case.
+ So it is best to keep the current (lack of) guarantee about which records
+ will be affected to avoid constraining our implementation.
+\end_layout
+
+\begin_layout Subsection
+Signal Handling is Not Race-Free
+\end_layout
+
+\begin_layout Standard
+The tdb_setalarm_sigptr() call allows the caller's signal handler to indicate
+ that the tdb locking code should return with a failure, rather than trying
+ again when a signal is received (and errno == EAGAIN).
+ This is usually used to implement timeouts.
+\end_layout
+
+\begin_layout Standard
+Unfortunately, this does not work in the case where the signal is received
+ before the tdb code enters the fcntl() call to place the lock: the code
+ will sleep within the fcntl() code, unaware that the signal wants it to
+ exit.
+ In the case of long timeouts, this does not happen in practice.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The locking hooks proposed in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Proposed-Solution-locking-hook"
+
+\end_inset
+
+ would allow the user to decide on whether to fail the lock acquisition
+ on a signal.
+ This allows the caller to choose their own compromise: they could narrow
+ the race by checking immediately before the fcntl call.
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+It may be possible to make this race-free in some implementations by having
+ the signal handler alter the struct flock to make it invalid.
+ This will cause the fcntl() lock call to fail with EINVAL if the signal
+ occurs before the kernel is entered, otherwise EAGAIN.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
+\begin_layout Subsection
+The API Uses Gratuitous Typedefs, Capitals
+\end_layout
+
+\begin_layout Standard
+typedefs are useful for providing source compatibility when types can differ
+ across implementations, or arguably in the case of function pointer definitions
+ which are hard for humans to parse.
+ Otherwise it is simply obfuscation and pollutes the namespace.
+\end_layout
+
+\begin_layout Standard
+Capitalization is usually reserved for compile-time constants and macros.
+\end_layout
+
+\begin_layout Description
+TDB_CONTEXT There is no reason to use this over 'struct tdb_context'; the
+ definition isn't visible to the API user anyway.
+\end_layout
+
+\begin_layout Description
+TDB_DATA There is no reason to use this over struct TDB_DATA; the struct
+ needs to be understood by the API user.
+\end_layout
+
+\begin_layout Description
+struct
+\begin_inset space ~
+\end_inset
+
+TDB_DATA This would normally be called 'struct tdb_data'.
+\end_layout
+
+\begin_layout Description
+enum
+\begin_inset space ~
+\end_inset
+
+TDB_ERROR Similarly, this would normally be enum tdb_error.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ Introducing lower case variants would please pedants like myself, but if
+ it were done the existing ones should be kept.
+ There is little point forcing a purely cosmetic change upon tdb users.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "tdb_log_func-Doesnt-Take"
+
+\end_inset
+
+tdb_log_func Doesn't Take The Private Pointer
+\end_layout
+
+\begin_layout Standard
+For API compatibility reasons, the logging function needs to call tdb_get_loggin
+g_private() to retrieve the pointer registered by the tdb_open_ex for logging.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+It should simply take an extra argument, since we are prepared to break
+ the API/ABI.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Various Callback Functions Are Not Typesafe
+\end_layout
+
+\begin_layout Standard
+The callback functions in tdb_set_logging_function (after
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "tdb_log_func-Doesnt-Take"
+
+\end_inset
+
+ is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read and tdb_check
+ all take void * and must internally convert it to the argument type they
+ were expecting.
+\end_layout
+
+\begin_layout Standard
+If this type changes, the compiler will not produce warnings on the callers,
+ since it only sees void *.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+With careful use of macros, we can create callback functions which give
+ a warning when used on gcc and the types of the callback and its private
+ argument differ.
+ Unsupported compilers will not give a warning, which is no worse than now.
+ In addition, the callbacks become clearer, as they need not use void *
+ for their parameter.
+\end_layout
+
+\begin_layout Standard
+See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
+\end_layout
+
+\begin_layout Standard
+The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB file should
+ be cleared if the caller discovers it is the only process with the TDB
+ open.
+ However, if any caller does not specify TDB_CLEAR_IF_FIRST it will not
+ be detected, so will have the TDB erased underneath them (usually resulting
+ in a crash).
+\end_layout
+
+\begin_layout Standard
+There is a similar issue on fork(); if the parent exits (or otherwise closes
+ the tdb) before the child calls tdb_reopen_all() to establish the lock
+ used to indicate the TDB is opened by someone, a TDB_CLEAR_IF_FIRST opener
+ at that moment will believe it alone has opened the TDB and will erase
+ it.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove TDB_CLEAR_IF_FIRST.
+ Other workarounds are possible, but see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Extending The Header Is Difficult
+\end_layout
+
+\begin_layout Standard
+We have reserved (zeroed) words in the TDB header, which can be used for
+ future features.
+ If the future features are compulsory, the version number must be updated
+ to prevent old code from accessing the database.
+ But if the future feature is optional, we have no way of telling if older
+ code is accessing the database or not.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The header should contain a
+\begin_inset Quotes eld
+\end_inset
+
+format variant
+\begin_inset Quotes erd
+\end_inset
+
+ value (64-bit).
+ This is divided into two 32-bit parts:
+\end_layout
+
+\begin_layout Enumerate
+The lower part reflects the format variant understood by code accessing
+ the database.
+\end_layout
+
+\begin_layout Enumerate
+The upper part reflects the format variant you must understand to write
+ to the database (otherwise you can only open for reading).
+\end_layout
+
+\begin_layout Standard
+The latter field can only be written at creation time, the former should
+ be written under the OPEN_LOCK when opening the database for writing, if
+ the variant of the code is lower than the current lowest variant.
+\end_layout
+
+\begin_layout Standard
+This should allow backwards-compatible features to be added, and detection
+ if older code (which doesn't understand the feature) writes to the database.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Record Headers Are Not Expandable
+\end_layout
+
+\begin_layout Standard
+If we later want to add (say) checksums on keys and data, it would require
+ another format change, which we'd like to avoid.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We often have extra padding at the tail of a record.
+ If we ensure that the first byte (if any) of this padding is zero, we will
+ have a way for future changes to detect code which doesn't understand a
+ new format: the new code would write (say) a 1 at the tail, and thus if
+ there is no tail or the first byte is 0, we would know the extension is
+ not present on that record.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+TDB Does Not Use Talloc
+\end_layout
+
+\begin_layout Standard
+Many users of TDB (particularly Samba) use the talloc allocator, and thus
+ have to wrap TDB in a talloc context to use it conveniently.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The allocation within TDB is not complicated enough to justify the use of
+ talloc, and I am reluctant to force another (excellent) library on TDB
+ users.
+ Nonetheless a compromise is possible.
+ An attribute (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "attributes"
+
+\end_inset
+
+) can be added later to tdb_open() to provide an alternate allocation mechanism,
+ specifically for talloc but usable by any other allocator (which would
+ ignore the
+\begin_inset Quotes eld
+\end_inset
+
+context
+\begin_inset Quotes erd
+\end_inset
+
+ argument).
+\end_layout
+
+\begin_layout Standard
+This would form a talloc hierarchy as expected, but the caller would still
+ have to attach a destructor to the tdb context returned from tdb_open to
+ close it.
+ All TDB_DATA fields would be children of the tdb_context, and the caller
+ would still have to manage them (using talloc_free() or talloc_steal()).
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Section
+Performance And Scalability Issues
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB_CLEAR_IF_FIRST-Imposes-Performance"
+
+\end_inset
+
+TDB_CLEAR_IF_FIRST Imposes Performance Penalty
+\end_layout
+
+\begin_layout Standard
+When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is placed at offset
+ 4 (aka.
+ the ACTIVE_LOCK).
+ While these locks never conflict in normal tdb usage, they do add substantial
+ overhead for most fcntl lock implementations when the kernel scans to detect
+ if a lock conflict exists.
+ This is often a single linked list, making the time to acquire and release
+ a fcntl lock O(N) where N is the number of processes with the TDB open,
+ not the number actually doing work.
+\end_layout
+
+\begin_layout Standard
+In a Samba server it is common to have huge numbers of clients sitting idle,
+ and thus they have weaned themselves off the TDB_CLEAR_IF_FIRST flag.
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+There is a flag to tdb_reopen_all() which is used for this optimization:
+ if the parent process will outlive the child, the child does not need the
+ ACTIVE_LOCK.
+ This is a workaround for this very performance issue.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove the flag.
+ It was a neat idea, but even trivial servers tend to know when they are
+ initializing for the first time and can simply unlink the old tdb at that
+ point.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+TDB Files Have a 4G Limit
+\end_layout
+
+\begin_layout Standard
+This seems to be becoming an issue (so much for
+\begin_inset Quotes eld
+\end_inset
+
+trivial
+\begin_inset Quotes erd
+\end_inset
+
+!), particularly for ldb.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+A new, incompatible TDB format which uses 64 bit offsets internally rather
+ than 32 bit as now.
+ For simplicity of endian conversion (which TDB does on the fly if required),
+ all values will be 64 bit on disk.
+ In practice, some upper bits may be used for other purposes, but at least
+ 56 bits will be available for file offsets.
+\end_layout
+
+\begin_layout Standard
+tdb_open() will automatically detect old-version files, and even create them
+ if TDB_VERSION6 is specified to tdb_open.
+\end_layout
+
+\begin_layout Standard
+32 bit processes will still be able to access TDBs larger than 4G (assuming
+ that their off_t allows them to seek to 64 bits), they will gracefully
+ fall back as they fail to mmap.
+ This can happen already with large TDBs.
+\end_layout
+
+\begin_layout Standard
+Old versions of tdb will fail to open the new TDB files (since 28 August
+ 2009, commit 398d0c29290: prior to that any unrecognized file format would
+ be erased and initialized as a fresh tdb!)
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+TDB Records Have a 4G Limit
+\end_layout
+
+\begin_layout Standard
+This has not been a reported problem, and the API uses size_t which can
+ be 64 bit on 64 bit platforms.
+ However, other limits may have made such an issue moot.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Record sizes will be 64 bit, with an error returned on 32 bit platforms
+ which try to access such records (the current implementation would return
+ TDB_ERR_OOM in a similar case).
+ It seems unlikely that 32 bit keys will be a limitation, so the implementation
+ may not support this (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Records-Incur-A"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Hash Size Is Determined At TDB Creation Time
+\end_layout
+
+\begin_layout Standard
+TDB contains a number of hash chains in the header; the number is specified
+ at creation time, and defaults to 131.
+ This is such a bottleneck on large databases (as each hash chain gets quite
+ long), that LDB uses 10,000 for this hash.
+ In general it is impossible to know what the 'right' answer is at database
+ creation time.
+\end_layout
+
+\begin_layout Subsubsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:Hash-Size-Solution"
+
+\end_inset
+
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+After comprehensive performance testing on various scalable hash variants
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoying
+ because I was previously convinced that an expanding tree of hashes would
+ be very close to optimal.
+\end_layout
+
+\end_inset
+
+, it became clear that it is hard to beat a straight linear hash table which
+ doubles in size when it reaches saturation.
+ Unfortunately, altering the hash table introduces serious locking complications
+: the entire hash table needs to be locked to enlarge the hash table, and
+ others might be holding locks.
+ Particularly insidious are insertions done under tdb_chainlock.
+\end_layout
+
+\begin_layout Standard
+Thus an expanding layered hash will be used: an array of hash groups, with
+ each hash group exploding into pointers to lower hash groups once it fills,
+ turning into a hash tree.
+ This has implications for locking: we must lock the entire group in case
+ we need to expand it, yet we don't know how deep the tree is at that point.
+\end_layout
+
+\begin_layout Standard
+Note that bits from the hash table entries should be stolen to hold more
+ hash bits to reduce the penalty of collisions.
+ We can use the otherwise-unused lower 3 bits.
+ If we limit the size of the database to 64 exabytes, we can use the top
+ 8 bits of the hash entry as well.
+ These 11 bits would reduce false positives down to 1 in 2000 which is more
+ than we need: we can use one of the bits to indicate that the extra hash
+ bits are valid.
+ This means we can choose not to re-hash all entries when we expand a hash
+ group; simply use the next bits we need and mark them invalid.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB-Freelist-Is"
+
+\end_inset
+
+TDB Freelist Is Highly Contended
+\end_layout
+
+\begin_layout Standard
+TDB uses a single linked list for the free list.
+ Allocation occurs as follows, using heuristics which have evolved over
+ time:
+\end_layout
+
+\begin_layout Enumerate
+Get the free list lock for this whole operation.
+\end_layout
+
+\begin_layout Enumerate
+Multiply length by 1.25, so we always over-allocate by 25%.
+\end_layout
+
+\begin_layout Enumerate
+Set the slack multiplier to 1.
+\end_layout
+
+\begin_layout Enumerate
+Examine the current freelist entry: if it is > length but < the current
+ best case, remember it as the best case.
+\end_layout
+
+\begin_layout Enumerate
+Multiply the slack multiplier by 1.05.
+\end_layout
+
+\begin_layout Enumerate
+If our best fit so far is less than length * slack multiplier, return it.
+ The slack will be turned into a new free record if it's large enough.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, go onto the next freelist entry.
+\end_layout
+
+\begin_layout Standard
+Deleting a record occurs as follows:
+\end_layout
+
+\begin_layout Enumerate
+Lock the hash chain for this whole operation.
+\end_layout
+
+\begin_layout Enumerate
+Walk the chain to find the record, keeping the prev pointer offset.
+\end_layout
+
+\begin_layout Enumerate
+If max_dead is non-zero:
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+Walk the hash chain again and count the dead records.
+\end_layout
+
+\begin_layout Enumerate
+If it's more than max_dead, bulk free all the dead ones (similar to steps
+ 4 and below, but the lock is only obtained once).
+\end_layout
+
+\begin_layout Enumerate
+Simply mark this record as dead and return.
+
+\end_layout
+
+\end_deeper
+\begin_layout Enumerate
+Get the free list lock for the remainder of this operation.
+\end_layout
+
+\begin_layout Enumerate
+\begin_inset CommandInset label
+LatexCommand label
+name "right-merging"
+
+\end_inset
+
+Examine the following block to see if it is free; if so, enlarge the current
+ block and remove that block from the free list.
+ This was disabled, as removal from the free list was O(entries-in-free-list).
+\end_layout
+
+\begin_layout Enumerate
+Examine the preceding block to see if it is free: for this reason, each
+ block has a 32-bit tailer which indicates its length.
+ If it is free, expand it to cover our new block and return.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, prepend ourselves to the free list.
+\end_layout
+
+\begin_layout Standard
+Disabling right-merging (step
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "right-merging"
+
+\end_inset
+
+) causes fragmentation; the other heuristics proved insufficient to address
+ this, so the final answer to this was that when we expand the TDB file
+ inside a transaction commit, we repack the entire tdb.
+\end_layout
+
+\begin_layout Standard
+The single list lock limits our allocation rate; due to the other issues
+ this is not currently seen as a bottleneck.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The first step is to remove all the current heuristics, as they obviously
+ interact, then examine them once the lock contention is addressed.
+\end_layout
+
+\begin_layout Standard
+The free list must be split to reduce contention.
+ Assuming perfect free merging, we can at most have 1 free list entry for
+ each entry.
+ This implies that the number of free lists is related to the size of the
+ hash table, but as it is rare to walk a large number of free list entries
+ we can use far fewer, say 1/32 of the number of hash buckets.
+\end_layout
+
+\begin_layout Standard
+It seems tempting to try to reuse the hash implementation which we use for
+ records here, but we have two ways of searching for free entries: for allocatio
+n we search by size (and possibly zone) which produces too many clashes
+ for our hash table to handle well, and for coalescing we search by address.
+ Thus an array of doubly-linked free lists seems preferable.
+\end_layout
+
+\begin_layout Standard
+There are various benefits in using per-size free lists (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+) but it's not clear this would reduce contention in the common case where
+ all processes are allocating/freeing the same size.
+ Thus we almost certainly need to divide in other ways: the most obvious
+ is to divide the file into zones, and use a free list (or table of free
+ lists) for each.
+ This approximates address ordering.
+\end_layout
+
+\begin_layout Standard
+Unfortunately it is difficult to know what heuristics should be used to
+ determine zone sizes, and our transaction code relies on being able to
+ create a
+\begin_inset Quotes eld
+\end_inset
+
+recovery area
+\begin_inset Quotes erd
+\end_inset
+
+ by simply appending to the file (difficult if it would need to create a
+ new zone header).
+ Thus we use a linked-list of free tables; currently we only ever create
+ one, but if there is more than one we choose one at random to use.
+ In future we may use heuristics to add new free tables on contention.
+ We only expand the file when all free tables are exhausted.
+\end_layout
+
+\begin_layout Standard
+The basic algorithm is as follows.
+ Freeing is simple:
+\end_layout
+
+\begin_layout Enumerate
+Identify the correct free list.
+\end_layout
+
+\begin_layout Enumerate
+Lock the corresponding list.
+\end_layout
+
+\begin_layout Enumerate
+Re-check the list (we didn't have a lock, sizes could have changed): relock
+ if necessary.
+\end_layout
+
+\begin_layout Enumerate
+Place the freed entry in the list.
+\end_layout
+
+\begin_layout Standard
+Allocation is a little more complicated, as we perform delayed coalescing
+ at this point:
+\end_layout
+
+\begin_layout Enumerate
+Pick a free table; usually the previous one.
+\end_layout
+
+\begin_layout Enumerate
+Lock the corresponding list.
+\end_layout
+
+\begin_layout Enumerate
+If the top entry is large enough, remove it from the list and return it.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, coalesce entries in the list.
+ If there was no entry large enough, unlock the list and try the next largest
+ list.
+\end_layout
+
+\begin_layout Enumerate
+If no list has an entry which meets our needs, try the next free table.
+\end_layout
+
+\begin_layout Enumerate
+If no free table satisfies, expand the file.
+\end_layout
+
+\begin_layout Standard
+This optimizes rapid insert/delete of free list entries by not coalescing
+ them all the time.
+ First-fit address ordering seems to be fairly good for keeping
+ fragmentation low (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+).
+ Note that address ordering does not need a tailer to coalesce, though if
+ we needed one we could have one cheaply: see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Records-Incur-A"
+
+\end_inset
+
+.
+
+\end_layout
+
+\begin_layout Standard
+Each free entry has the free table number in the header: less than 255.
+ It also contains a doubly-linked list for easy deletion.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+TDB Becomes Fragmented
+\end_layout
+
+\begin_layout Standard
+Much of this is a result of allocation strategy
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 ftp://ftp.cs.ute
+xas.edu/pub/garbage/malloc/ismm98.ps
+\end_layout
+
+\end_inset
+
+ and deliberate hobbling of coalescing; internal fragmentation (aka overallocati
+on) is deliberately set at 25%, and external fragmentation is only cured
+ by the decision to repack the entire db when a transaction commit needs
+ to enlarge the file.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The 25% overhead on allocation works in practice for ldb because indexes
+ tend to expand by one record at a time.
+ This internal fragmentation can be resolved by having an
+\begin_inset Quotes eld
+\end_inset
+
+expanded
+\begin_inset Quotes erd
+\end_inset
+
+ bit in the header to note entries that have previously expanded, and allocating
+ more space for them.
+\end_layout
+
+\begin_layout Standard
+There is a spectrum of possible solutions for external fragmentation:
+ one is to use a fragmentation-avoiding allocation strategy such as best-fit
+ address-order allocator.
+ The other end of the spectrum would be to use a bump allocator (very fast
+ and simple) and simply repack the file when we reach the end.
+\end_layout
+
+\begin_layout Standard
+There are three problems with efficient fragmentation-avoiding allocators:
+ they are non-trivial, they tend to use a single free list for each size,
+ and there's no evidence that tdb allocation patterns will match those recorded
+ for general allocators (though it seems likely).
+\end_layout
+
+\begin_layout Standard
+Thus we don't spend too much effort on external fragmentation; we will be
+ no worse than the current code if we need to repack on occasion.
+ More effort is spent on reducing freelist contention, and reducing overhead.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:Records-Incur-A"
+
+\end_inset
+
+Records Incur A 28-Byte Overhead
+\end_layout
+
+\begin_layout Standard
+Each TDB record has a header as follows:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_record {
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_off_t next; /* offset of the next record in the list */
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_len_t rec_len; /* total byte length of record */
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_len_t key_len; /* byte length of key */
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_len_t data_len; /* byte length of data */
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t full_hash; /* the full 32 bit hash of the key */
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t magic;   /* try to catch errors */
+\end_layout
+
+\begin_layout LyX-Code
+        /* the following union is implied:
+\end_layout
+
+\begin_layout LyX-Code
+                union {
+\end_layout
+
+\begin_layout LyX-Code
+                        char record[rec_len];
+\end_layout
+
+\begin_layout LyX-Code
+                        struct {
+\end_layout
+
+\begin_layout LyX-Code
+                                char key[key_len];
+\end_layout
+
+\begin_layout LyX-Code
+                                char data[data_len];
+\end_layout
+
+\begin_layout LyX-Code
+                        }
+\end_layout
+
+\begin_layout LyX-Code
+                        uint32_t totalsize; (tailer)
+\end_layout
+
+\begin_layout LyX-Code
+                }
+\end_layout
+
+\begin_layout LyX-Code
+        */
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+Naively, this would double to a 56-byte overhead on a 64 bit implementation.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We can use various techniques to reduce this for an allocated block:
+\end_layout
+
+\begin_layout Enumerate
+The 'next' pointer is not required, as we are using a flat hash table.
+\end_layout
+
+\begin_layout Enumerate
+'rec_len' can instead be expressed as an addition to key_len and data_len
+ (it accounts for wasted or overallocated length in the record).
+ Since the record length is always a multiple of 8, we can conveniently
+ fit it in 32 bits (representing up to 35 bits).
+\end_layout
+
+\begin_layout Enumerate
+'key_len' and 'data_len' can be reduced.
+ I'm unwilling to restrict 'data_len' to 32 bits, but instead we can combine
+ the two into one 64-bit field and use a 5 bit value which indicates at
+ what bit to divide the two.
+ Keys are unlikely to scale as fast as data, so I'm assuming a maximum key
+ size of 32 bits.
+\end_layout
+
+\begin_layout Enumerate
+'full_hash' is used to avoid a memcmp on the
+\begin_inset Quotes eld
+\end_inset
+
+miss
+\begin_inset Quotes erd
+\end_inset
+
+ case, but this is diminishing returns after a handful of bits (at 10 bits,
+ it reduces 99.9% of false memcmp).
+ As an aside, as the lower bits are already incorporated in the hash table
+ resolution, the upper bits should be used here.
+ Note that it's not clear that these bits will be a win, given the extra
+ bits in the hash table itself (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Hash-Size-Solution"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Enumerate
+'magic' does not need to be enlarged: it currently reflects one of 5 values
+ (used, free, dead, recovery, and unused_recovery).
+ It is useful for quick sanity checking however, and should not be eliminated.
+\end_layout
+
+\begin_layout Enumerate
+'tailer' is only used to coalesce free blocks (so a block to the right can
+ find the header to check if this block is free).
+ This can be replaced by a single 'free' bit in the header of the following
+ block (and the tailer only exists in free blocks).
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+This technique from Thomas Standish.
+ Data Structure Techniques.
+ Addison-Wesley, Reading, Massachusetts, 1980.
+\end_layout
+
+\end_inset
+
+ The current proposed coalescing algorithm doesn't need this, however.
+\end_layout
+
+\begin_layout Standard
+This produces a 16 byte used header like this:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_used_record {
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t used_magic : 16,
+\end_layout
+
+\begin_layout LyX-Code
+
+\end_layout
+
+\begin_layout LyX-Code
+                 key_data_divide: 5,
+\end_layout
+
+\begin_layout LyX-Code
+                 top_hash: 11;
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t extra_octets;
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t key_and_data_len;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+And a free record like this:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_free_record {
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t free_magic: 8,
+\end_layout
+
+\begin_layout LyX-Code
+                   prev : 56;
+\end_layout
+
+\begin_layout LyX-Code
+
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t free_table: 8,
+\end_layout
+
+\begin_layout LyX-Code
+                 total_length : 56;
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t next;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+Note that by limiting valid offsets to 56 bits, we can pack everything we
+ need into 3 64-bit words, meaning our minimum record size is 8 bytes.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Transaction Commit Requires 4 fdatasync
+\end_layout
+
+\begin_layout Standard
+The current transaction algorithm is:
+\end_layout
+
+\begin_layout Enumerate
+write_recovery_data();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+write_recovery_header();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+overwrite_with_new_data();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+remove_recovery_header();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Standard
+On current ext3, each sync flushes all data to disk, so the next 3 syncs
+ are relatively inexpensive.
+ But this could become a performance bottleneck on other filesystems such
+ as ext4.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Neil Brown points out that this is overzealous, and only one sync is needed:
+\end_layout
+
+\begin_layout Enumerate
+Bundle the recovery data, a transaction counter and a strong checksum of
+ the new data.
+\end_layout
+
+\begin_layout Enumerate
+Strong checksum that whole bundle.
+\end_layout
+
+\begin_layout Enumerate
+Store the bundle in the database.
+\end_layout
+
+\begin_layout Enumerate
+Overwrite the oldest of the two recovery pointers in the header (identified
+ using the transaction counter) with the offset of this bundle.
+\end_layout
+
+\begin_layout Enumerate
+sync.
+\end_layout
+
+\begin_layout Enumerate
+Write the new data to the file.
+\end_layout
+
+\begin_layout Standard
+Checking for recovery means identifying the latest bundle with a valid checksum
+ and using the new data checksum to ensure that it has been applied.
+ This is more expensive than the current check, but need only be done at
+ open.
+ For running databases, a separate header field can be used to indicate
+ a transaction in progress; we need only check for recovery if this is set.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:TDB-Does-Not"
+
+\end_inset
+
+TDB Does Not Have Snapshot Support
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ At some point you say
+\begin_inset Quotes eld
+\end_inset
+
+use a real database
+\begin_inset Quotes erd
+\end_inset
+
+ (but see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "replay-attribute"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Standard
+But as a thought experiment, if we implemented transactions to only overwrite
+ free entries (this is tricky: there must not be a header in each entry
+ which indicates whether it is free, but instead use presence in metadata elsewhere),
+ and a pointer to the hash table, we could create an entirely new commit
+ without destroying existing data.
+ Then it would be easy to implement snapshots in a similar way.
+\end_layout
+
+\begin_layout Standard
+This would not allow arbitrary changes to the database, such as tdb_repack
+ does, and would require more space (since we have to preserve the current
+ and future entries at once).
+ If we used hash trees rather than one big hash table, we might only have
+ to rewrite some sections of the hash, too.
+\end_layout
+
+\begin_layout Standard
+We could then implement snapshots using a similar method, using multiple
+ different hash tables/free tables.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Subsection
+Transactions Cannot Operate in Parallel
+\end_layout
+
+\begin_layout Standard
+This would be useless for ldb, as it hits the index records with just about
+ every update.
+ It would add significant complexity in resolving clashes, and cause
+ all transaction callers to write their code to loop in the case where the
+ transactions spuriously failed.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None (but see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "replay-attribute"
+
+\end_inset
+
+).
+ We could solve a small part of the problem by providing read-only transactions.
+ These would allow one write transaction to begin, but it could not commit
+ until all r/o transactions are done.
+ This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
+ commit.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Subsection
+Default Hash Function Is Suboptimal
+\end_layout
+
+\begin_layout Standard
+The Knuth-inspired multiplicative hash used by tdb is fairly slow (especially
+ if we expand it to 64 bits), and works best when the hash bucket size is
+ a prime number (which also means a slow modulus).
+ In addition, it is highly predictable which could potentially lead to a
+ Denial of Service attack in some TDB uses.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The Jenkins lookup3 hash
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+http://burtleburtle.net/bob/c/lookup3.c
+\end_layout
+
+\end_inset
+
+ is a fast and superbly-mixing hash.
+ It's used by the Linux kernel and almost everything else.
+ This has the particular properties that it takes an initial seed, and produces
+ two 32 bit hash numbers, which we can combine into a 64-bit hash.
+\end_layout
+
+\begin_layout Standard
+The seed should be created at tdb-creation time from some random source,
+ and placed in the header.
+ This is far from foolproof, but adds a little bit of protection against
+ hash bombing.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "Reliable-Traversal-Adds"
+
+\end_inset
+
+Reliable Traversal Adds Complexity
+\end_layout
+
+\begin_layout Standard
+We lock a record during traversal iteration, and try to grab that lock in
+ the delete code.
+ If that grab on delete fails, we simply mark it deleted and continue onwards;
+ traversal checks for this condition and does the delete when it moves off
+ the record.
+\end_layout
+
+\begin_layout Standard
+If traversal terminates, the dead record may be left indefinitely.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove reliability guarantees; see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "traverse-Proposed-Solution"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Fcntl Locking Adds Overhead
+\end_layout
+
+\begin_layout Standard
+Placing a fcntl lock means a system call, as does removing one.
+ This is actually one reason why transactions can be faster (everything
+ is locked once at transaction start).
+ In the uncontended case, this overhead can theoretically be eliminated.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+\end_layout
+
+\begin_layout Standard
+We tried this before with spinlock support, in the early days of TDB, and
+ it didn't make much difference except in manufactured benchmarks.
+\end_layout
+
+\begin_layout Standard
+We could use spinlocks (with futex kernel support under Linux), but it means
+ that we lose automatic cleanup when a process dies with a lock.
+ There is a method of auto-cleanup under Linux, but it's not supported by
+ other operating systems.
+ We could reintroduce a clear-if-first-style lock and sweep for dead futexes
+ on open, but that wouldn't help the normal case of one concurrent opener
+ dying.
+ Increasingly elaborate repair schemes could be considered, but they require
+ an ABI change (everyone must use them) anyway, so there's no need to do
+ this at the same time as everything else.
+\end_layout
+
+\begin_layout Subsection
+Some Transactions Don't Require Durability
+\end_layout
+
+\begin_layout Standard
+Volker points out that gencache uses a CLEAR_IF_FIRST tdb for normal (fast)
+ usage, and occasionally empties the results into a transactional TDB.
+ This kind of usage prioritizes performance over durability: as long as
+ we are consistent, data can be lost.
+\end_layout
+
+\begin_layout Standard
+This would be more neatly implemented inside tdb: a
+\begin_inset Quotes eld
+\end_inset
+
+soft
+\begin_inset Quotes erd
+\end_inset
+
+ transaction commit (ie.
+ syncless) which meant that data may be reverted on a crash.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+\end_layout
+
+\begin_layout Standard
+Unfortunately any transaction scheme which overwrites old data requires
+ a sync before that overwrite to avoid the possibility of corruption.
+\end_layout
+
+\begin_layout Standard
+It seems possible to use a scheme similar to that described in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Does-Not"
+
+\end_inset
+
+, where transactions are committed without overwriting existing data, and
+ an array of top-level pointers were available in the header.
+ If the transaction is
+\begin_inset Quotes eld
+\end_inset
+
+soft
+\begin_inset Quotes erd
+\end_inset
+
+ then we would not need a sync at all: existing processes would pick up
+ the new hash table and free list and work with that.
+\end_layout
+
+\begin_layout Standard
+At some later point, a sync would allow recovery of the old data into the
+ free lists (perhaps when the array of top-level pointers filled).
+ On crash, tdb_open() would examine the array of top levels, and apply the
+ transactions until it encountered an invalid checksum.
+\end_layout
+
+\begin_layout Subsection
+Tracing Is Fragile, Replay Is External
+\end_layout
+
+\begin_layout Standard
+The current TDB has compile-time-enabled tracing code, but it often breaks
+ as it is not enabled by default.
+ In a similar way, the ctdb code has an external wrapper which does replay
+ tracing so it can coordinate cluster-wide transactions.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\begin_inset CommandInset label
+LatexCommand label
+name "replay-attribute"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Tridge points out that an attribute can be later added to tdb_open (see
+
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "attributes"
+
+\end_inset
+
+) to provide replay/trace hooks, which could become the basis for this and
+ future parallel transactions and snapshot support.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\end_body
+\end_document
diff --git a/lib/ntdb/doc/design.lyx,v b/lib/ntdb/doc/design.lyx,v
new file mode 100644
index 0000000000..13e6387f7f
--- /dev/null
+++ b/lib/ntdb/doc/design.lyx,v
@@ -0,0 +1,4679 @@
+head	1.13;
+access;
+symbols;
+locks; strict;
+comment	@# @;
+
+
+1.13
+date	2011.03.01.11.46.54;	author rusty;	state Exp;
+branches;
+next	1.12;
+
+1.12
+date	2010.12.01.12.20.49;	author rusty;	state Exp;
+branches;
+next	1.11;
+
+1.11
+date	2010.12.01.11.55.20;	author rusty;	state Exp;
+branches;
+next	1.10;
+
+1.10
+date	2010.09.14.00.33.57;	author rusty;	state Exp;
+branches;
+next	1.9;
+
+1.9
+date	2010.09.09.07.25.12;	author rusty;	state Exp;
+branches;
+next	1.8;
+
+1.8
+date	2010.09.02.02.29.05;	author rusty;	state Exp;
+branches;
+next	1.7;
+
+1.7
+date	2010.09.01.10.58.12;	author rusty;	state Exp;
+branches;
+next	1.6;
+
+1.6
+date	2010.08.02.00.21.43;	author rusty;	state Exp;
+branches;
+next	1.5;
+
+1.5
+date	2010.08.02.00.21.16;	author rusty;	state Exp;
+branches;
+next	1.4;
+
+1.4
+date	2010.05.10.13.09.11;	author rusty;	state Exp;
+branches;
+next	1.3;
+
+1.3
+date	2010.05.10.11.58.37;	author rusty;	state Exp;
+branches;
+next	1.2;
+
+1.2
+date	2010.05.10.05.35.13;	author rusty;	state Exp;
+branches;
+next	1.1;
+
+1.1
+date	2010.05.04.02.29.16;	author rusty;	state Exp;
+branches;
+next	;
+
+
+desc
+@First draft
+@
+
+
+1.13
+log
+@Thread-safe API
+@
+text
+@#LyX 1.6.7 created this file. For more info see http://www.lyx.org/
+\lyxformat 345
+\begin_document
+\begin_header
+\textclass article
+\use_default_options true
+\language english
+\inputencoding auto
+\font_roman default
+\font_sans default
+\font_typewriter default
+\font_default_family default
+\font_sc false
+\font_osf false
+\font_sf_scale 100
+\font_tt_scale 100
+
+\graphics default
+\paperfontsize default
+\use_hyperref false
+\papersize default
+\use_geometry false
+\use_amsmath 1
+\use_esint 1
+\cite_engine basic
+\use_bibtopic false
+\paperorientation portrait
+\secnumdepth 3
+\tocdepth 3
+\paragraph_separation indent
+\defskip medskip
+\quotes_language english
+\papercolumns 1
+\papersides 1
+\paperpagestyle default
+\tracking_changes true
+\output_changes true
+\author "Rusty Russell,,,"
+\author ""
+\end_header
+
+\begin_body
+
+\begin_layout Title
+TDB2: Redesigning The Trivial DataBase
+\end_layout
+
+\begin_layout Author
+Rusty Russell, IBM Corporation
+\end_layout
+
+\begin_layout Date
+1-December-2010
+\end_layout
+
+\begin_layout Abstract
+The Trivial DataBase on-disk format is 32 bits; with usage cases heading
+ towards the 4G limit, that must change.
+ This required breakage provides an opportunity to revisit TDB's other design
+ decisions and reassess them.
+\end_layout
+
+\begin_layout Section
+Introduction
+\end_layout
+
+\begin_layout Standard
+The Trivial DataBase was originally written by Andrew Tridgell as a simple
+ key/data pair storage system with the same API as dbm, but allowing multiple
+ readers and writers while being small enough (< 1000 lines of C) to include
+ in SAMBA.
+ The simple design created in 1999 has proven surprisingly robust and performant
+, used in Samba versions 3 and 4 as well as numerous other projects.
+ Its useful life was greatly increased by the (backwards-compatible!) addition
+ of transaction support in 2005.
+\end_layout
+
+\begin_layout Standard
+The wider variety and greater demands of TDB-using code has led to some
+ organic growth of the API, as well as some compromises on the implementation.
+ None of these, by themselves, are seen as show-stoppers, but the cumulative
+ effect is a loss of elegance over the initial, simple TDB implementation.
+ Here is a table of the approximate number of lines of implementation code
+ and number of API functions at the end of each year:
+\end_layout
+
+\begin_layout Standard
+\begin_inset Tabular
+<lyxtabular version="3" rows="12" columns="3">
+<features>
+<column alignment="center" valignment="top" width="0">
+<column alignment="center" valignment="top" width="0">
+<column alignment="center" valignment="top" width="0">
+<row>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Year End
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+API Functions
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Lines of C Code Implementation
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1999
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+13
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1195
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2000
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+24
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1725
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2001
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+32
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2228
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2002
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+35
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2481
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2003
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+35
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2552
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2004
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+40
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2584
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2005
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+38
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2647
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2006
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+52
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+3754
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2007
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+66
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+4398
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2008
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+71
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+4768
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2009
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+73
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+5715
+\end_layout
+
+\end_inset
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+This review is an attempt to catalog and address all the known issues with
+ TDB and create solutions which address the problems without significantly
+ increasing complexity; all involved are far too aware of the dangers of
+ second system syndrome in rewriting a successful project like this.
+\end_layout
+
+\begin_layout Section
+API Issues
+\end_layout
+
+\begin_layout Subsection
+tdb_open_ex Is Not Expandable
+\end_layout
+
+\begin_layout Standard
+The tdb_open() call was expanded to tdb_open_ex(), which added an optional
+ hashing function and an optional logging function argument.
+ Additional arguments to open would require the introduction of a tdb_open_ex2
+ call etc.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\begin_inset CommandInset label
+LatexCommand label
+name "attributes"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+tdb_open() will take a linked-list of attributes:
+\end_layout
+
+\begin_layout LyX-Code
+enum tdb_attribute {
+\end_layout
+
+\begin_layout LyX-Code
+    TDB_ATTRIBUTE_LOG = 0,
+\end_layout
+
+\begin_layout LyX-Code
+    TDB_ATTRIBUTE_HASH = 1
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_base {
+\end_layout
+
+\begin_layout LyX-Code
+    enum tdb_attribute attr;
+\end_layout
+
+\begin_layout LyX-Code
+    union tdb_attribute *next;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_log {
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
+\end_layout
+
+\begin_layout LyX-Code
+    tdb_log_func log_fn;
+\end_layout
+
+\begin_layout LyX-Code
+    void *log_private;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_hash {
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
+\end_layout
+
+\begin_layout LyX-Code
+    tdb_hash_func hash_fn;
+\end_layout
+
+\begin_layout LyX-Code
+    void *hash_private;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+union tdb_attribute {
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_base base;
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_log log;
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_hash hash;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+This allows future attributes to be added, even if this expands the size
+ of the union.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+tdb_traverse Makes Impossible Guarantees
+\end_layout
+
+\begin_layout Standard
+tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, and it
+ was thought that it was important to guarantee that all records which exist
+ at the start and end of the traversal would be included, and no record
+ would be included twice.
+\end_layout
+
+\begin_layout Standard
+This adds complexity (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Reliable-Traversal-Adds"
+
+\end_inset
+
+) and does not work anyway for records which are altered (in particular,
+ those which are expanded may be effectively deleted and re-added behind
+ the traversal).
+\end_layout
+
+\begin_layout Subsubsection
+\begin_inset CommandInset label
+LatexCommand label
+name "traverse-Proposed-Solution"
+
+\end_inset
+
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Abandon the guarantee.
+ You will see every record if no changes occur during your traversal, otherwise
+ you will see some subset.
+ You can prevent changes by using a transaction or the locking API.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+ Delete-during-traverse will still delete every record, too (assuming no
+ other changes).
+\end_layout
+
+\begin_layout Subsection
+Nesting of Transactions Is Fraught
+\end_layout
+
+\begin_layout Standard
+TDB has alternated between allowing nested transactions and not allowing
+ them.
+ Various paths in the Samba codebase assume that transactions will nest,
+ and in a sense they can: the operation is only committed to disk when the
+ outer transaction is committed.
+ There are two problems, however:
+\end_layout
+
+\begin_layout Enumerate
+Canceling the inner transaction will cause the outer transaction commit
+ to fail, and will not undo any operations since the inner transaction began.
+ This problem is soluble with some additional internal code.
+\end_layout
+
+\begin_layout Enumerate
+An inner transaction commit can be cancelled by the outer transaction.
+ This is desirable in the way which Samba's database initialization code
+ uses transactions, but could be a surprise to any users expecting a successful
+ transaction commit to expose changes to others.
+\end_layout
+
+\begin_layout Standard
+The current solution is to specify the behavior at tdb_open(), with the
+ default currently that nested transactions are allowed.
+ This flag can also be changed at runtime.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Given the usage patterns, it seems that the
+\begin_inset Quotes eld
+\end_inset
+
+least-surprise
+\begin_inset Quotes erd
+\end_inset
+
+ behavior of disallowing nested transactions should become the default.
+ Additionally, it seems the outer transaction is the only code which knows
+ whether inner transactions should be allowed, so a flag to indicate this
+ could be added to tdb_transaction_start.
+ However, this behavior can be simulated with a wrapper which uses tdb_add_flags
+() and tdb_remove_flags(), so the API should not be expanded for this relatively
+-obscure case.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1298979572
+Incomplete; nesting flag is still defined as per tdb1.
+\change_inserted 0 1298979584
+Complete; the nesting flag has been removed.
+\change_unchanged
+
+\end_layout
+
+\begin_layout Subsection
+Incorrect Hash Function is Not Detected
+\end_layout
+
+\begin_layout Standard
+tdb_open_ex() allows the calling code to specify a different hash function
+ to use, but does not check that all other processes accessing this tdb
+ are using the same hash function.
+ The result is that records are missing from tdb_fetch().
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The header should contain an example hash result (eg.
+ the hash of 0xdeadbeef), and tdb_open_ex() should check that the given
+ hash function produces the same answer, or fail the tdb_open call.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+tdb_set_max_dead/TDB_VOLATILE Expose Implementation
+\end_layout
+
+\begin_layout Standard
+In response to scalability issues with the free list (
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB-Freelist-Is"
+
+\end_inset
+
+) two API workarounds have been incorporated in TDB: tdb_set_max_dead()
+ and the TDB_VOLATILE flag to tdb_open.
+ The latter actually calls the former with an argument of
+\begin_inset Quotes eld
+\end_inset
+
+5
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+This code allows deleted records to accumulate without putting them in the
+ free list.
+ On delete we iterate through each chain and free them in a batch if there
+ are more than max_dead entries.
+ These are never otherwise recycled except as a side-effect of a tdb_repack.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+With the scalability problems of the freelist solved, this API can be removed.
+ The TDB_VOLATILE flag may still be useful as a hint that store and delete
+ of records will be at least as common as fetch in order to allow some internal
+ tuning, but initially will become a no-op.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+ TDB_VOLATILE still defined, but implementation should fail on unknown flags
+ to be future-proof.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB-Files-Cannot"
+
+\end_inset
+
+TDB Files Cannot Be Opened Multiple Times In The Same Process
+\end_layout
+
+\begin_layout Standard
+No process can open the same TDB twice; we check and disallow it.
+ This is an unfortunate side-effect of fcntl locks, which operate on a per-file
+ rather than per-file-descriptor basis, and do not nest.
+ Thus, closing any file descriptor on a file clears all the locks obtained
+ by this process, even if they were placed using a different file descriptor!
+\end_layout
+
+\begin_layout Standard
+Note that even if this were solved, deadlock could occur if operations were
+ nested: this is a more manageable programming error in most cases.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We could lobby POSIX to fix the perverse rules, or at least lobby Linux
+ to violate them so that the most common implementation does not have this
+ restriction.
+ This would be a generally good idea for other fcntl lock users.
+\end_layout
+
+\begin_layout Standard
+Samba uses a wrapper which hands out the same tdb_context to multiple callers
+ if this happens, and does simple reference counting.
+ We should do this inside the tdb library, which already emulates lock nesting
+ internally; it would need to recognize when deadlock occurs within a single
+ process.
+ This would create a new failure mode for tdb operations (while we currently
+ handle locking failures, they are impossible in normal use and a process
+ encountering them can do little but give up).
+\end_layout
+
+\begin_layout Standard
+I do not see benefit in an additional tdb_open flag to indicate whether
+ re-opening is allowed, though there may be some benefit to adding a
+ call to detect when a tdb_context is shared, to allow others to create such
+ an API.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
+\begin_layout Subsection
+TDB API Is Not POSIX Thread-safe
+\end_layout
+
+\begin_layout Standard
+The TDB API uses an error code which can be queried after an operation to
+ determine what went wrong.
+ This programming model does not work with threads, unless specific additional
+ guarantees are given by the implementation.
+ In addition, even otherwise-independent threads cannot open the same TDB
+ (as in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB-Files-Cannot"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Rearchitecting the API to include a tdb_errcode pointer would be a great
+ deal of churn
+\change_inserted 0 1298979557
+, but fortunately most functions return 0 on success and -1 on error: we
+ can change these to return 0 on success and a negative error code on error,
+ and the API remains similar to previous.
+ The tdb_fetch, tdb_firstkey and tdb_nextkey functions need to take a TDB_DATA
+ pointer and return an error code.
+ It is also simpler to have tdb_nextkey replace its key argument in place,
+ freeing up any old .dptr.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1298979438
+; we are better to guarantee that the tdb_errcode is per-thread so the current
+ programming model can be maintained.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1298979438
+This requires dynamic per-thread allocations, which is awkward with POSIX
+ threads (pthread_key_create space is limited and we cannot simply allocate
+ a key for every TDB).
+\change_unchanged
+
+\end_layout
+
+\begin_layout Standard
+Internal locking is required to make sure that fcntl locks do not overlap
+ between threads, and also that the global list of tdbs is maintained.
+\end_layout
+
+\begin_layout Standard
+The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
+ version of the library, and otherwise no overhead will exist.
+ Alternatively, a hooking mechanism similar to that proposed for
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Proposed-Solution-locking-hook"
+
+\end_inset
+
+ could be used to enable pthread locking at runtime.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete
+\change_inserted 0 1298979681
+; API has been changed but thread safety has not been implemented.
+\change_deleted 0 1298979669
+.
+\change_unchanged
+
+\end_layout
+
+\begin_layout Subsection
+*_nonblock Functions And *_mark Functions Expose Implementation
+\end_layout
+
+\begin_layout Standard
+CTDB
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+Clustered TDB, see http://ctdb.samba.org
+\end_layout
+
+\end_inset
+
+ wishes to operate on TDB in a non-blocking manner.
+ This is currently done as follows:
+\end_layout
+
+\begin_layout Enumerate
+Call the _nonblock variant of an API function (eg.
+ tdb_lockall_nonblock).
+ If this fails:
+\end_layout
+
+\begin_layout Enumerate
+Fork a child process, and wait for it to call the normal variant (eg.
+ tdb_lockall).
+\end_layout
+
+\begin_layout Enumerate
+If the child succeeds, call the _mark variant to indicate we already have
+ the locks (eg.
+ tdb_lockall_mark).
+\end_layout
+
+\begin_layout Enumerate
+Upon completion, tell the child to release the locks (eg.
+ tdb_unlockall).
+\end_layout
+
+\begin_layout Enumerate
+Indicate to tdb that it should consider the locks removed (eg.
+ tdb_unlockall_mark).
+\end_layout
+
+\begin_layout Standard
+There are several issues with this approach.
+ Firstly, adding two new variants of each function clutters the API for
+ an obscure use, and so not all functions have three variants.
+ Secondly, it assumes that all paths of the functions ask for the same locks,
+ otherwise the parent process will have to get a lock which the child doesn't
+ have under some circumstances.
+ I don't believe this is currently the case, but it constrains the implementatio
+n.
+
+\end_layout
+
+\begin_layout Subsubsection
+\begin_inset CommandInset label
+LatexCommand label
+name "Proposed-Solution-locking-hook"
+
+\end_inset
+
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Implement a hook for locking methods, so that the caller can control the
+ calls to create and remove fcntl locks.
+ In this scenario, ctdbd would operate as follows:
+\end_layout
+
+\begin_layout Enumerate
+Call the normal API function, eg tdb_lockall().
+\end_layout
+
+\begin_layout Enumerate
+When the lock callback comes in, check if the child has the lock.
+ Initially, this is always false.
+ If so, return 0.
+ Otherwise, try to obtain it in non-blocking mode.
+ If that fails, return EWOULDBLOCK.
+\end_layout
+
+\begin_layout Enumerate
+Release locks in the unlock callback as normal.
+\end_layout
+
+\begin_layout Enumerate
+If tdb_lockall() fails, see if we recorded a lock failure; if so, call the
+ child to repeat the operation.
+\end_layout
+
+\begin_layout Enumerate
+The child records what locks it obtains, and returns that information to
+ the parent.
+\end_layout
+
+\begin_layout Enumerate
+When the child has succeeded, goto 1.
+\end_layout
+
+\begin_layout Standard
+This is flexible enough to handle any potential locking scenario, even when
+ lock requirements change.
+ It can be optimized so that the parent does not release locks, just tells
+ the child which locks it doesn't need to obtain.
+\end_layout
+
+\begin_layout Standard
+It also keeps the complexity out of the API, and in ctdbd where it is needed.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
+\begin_layout Subsection
+tdb_chainlock Functions Expose Implementation
+\end_layout
+
+\begin_layout Standard
+tdb_chainlock locks some number of records, including the record indicated
+ by the given key.
+ This gave atomicity guarantees; no-one can start a transaction, alter,
+ read or delete that key while the lock is held.
+\end_layout
+
+\begin_layout Standard
+It also makes the same guarantee for any other key in the chain, which is
+ an internal implementation detail and potentially a cause for deadlock.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ It would be nice to have an explicit single entry lock which affected no
+ other keys.
+ Unfortunately, this won't work for an entry which doesn't exist.
+ Thus while chainlock may be implemented more efficiently for the existing
+ case, it will still have overlap issues with the non-existing case.
+ So it is best to keep the current (lack of) guarantee about which records
+ will be affected to avoid constraining our implementation.
+\end_layout
+
+\begin_layout Subsection
+Signal Handling is Not Race-Free
+\end_layout
+
+\begin_layout Standard
+The tdb_setalarm_sigptr() call allows the caller's signal handler to indicate
+ that the tdb locking code should return with a failure, rather than trying
+ again when a signal is received (and errno == EAGAIN).
+ This is usually used to implement timeouts.
+\end_layout
+
+\begin_layout Standard
+Unfortunately, this does not work in the case where the signal is received
+ before the tdb code enters the fcntl() call to place the lock: the code
+ will sleep within the fcntl() code, unaware that the signal wants it to
+ exit.
+ In the case of long timeouts, this does not happen in practice.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The locking hooks proposed in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Proposed-Solution-locking-hook"
+
+\end_inset
+
+ would allow the user to decide on whether to fail the lock acquisition
+ on a signal.
+ This allows the caller to choose their own compromise: they could narrow
+ the race by checking immediately before the fcntl call.
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+It may be possible to make this race-free in some implementations by having
+ the signal handler alter the struct flock to make it invalid.
+ This will cause the fcntl() lock call to fail with EINVAL if the signal
+ occurs before the kernel is entered, otherwise EAGAIN.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
+\begin_layout Subsection
+The API Uses Gratuitous Typedefs, Capitals
+\end_layout
+
+\begin_layout Standard
+typedefs are useful for providing source compatibility when types can differ
+ across implementations, or arguably in the case of function pointer definitions
+ which are hard for humans to parse.
+ Otherwise it is simply obfuscation and pollutes the namespace.
+\end_layout
+
+\begin_layout Standard
+Capitalization is usually reserved for compile-time constants and macros.
+\end_layout
+
+\begin_layout Description
+TDB_CONTEXT There is no reason to use this over 'struct tdb_context'; the
+ definition isn't visible to the API user anyway.
+\end_layout
+
+\begin_layout Description
+TDB_DATA There is no reason to use this over struct TDB_DATA; the struct
+ needs to be understood by the API user.
+\end_layout
+
+\begin_layout Description
+struct
+\begin_inset space ~
+\end_inset
+
+TDB_DATA This would normally be called 'struct tdb_data'.
+\end_layout
+
+\begin_layout Description
+enum
+\begin_inset space ~
+\end_inset
+
+TDB_ERROR Similarly, this would normally be enum tdb_error.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ Introducing lower case variants would please pedants like myself, but if
+ it were done the existing ones should be kept.
+ There is little point forcing a purely cosmetic change upon tdb users.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "tdb_log_func-Doesnt-Take"
+
+\end_inset
+
+tdb_log_func Doesn't Take The Private Pointer
+\end_layout
+
+\begin_layout Standard
+For API compatibility reasons, the logging function needs to call tdb_get_loggin
+g_private() to retrieve the pointer registered by the tdb_open_ex for logging.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+It should simply take an extra argument, since we are prepared to break
+ the API/ABI.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Various Callback Functions Are Not Typesafe
+\end_layout
+
+\begin_layout Standard
+The callback functions in tdb_set_logging_function (after
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "tdb_log_func-Doesnt-Take"
+
+\end_inset
+
+ is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read and tdb_check
+ all take void * and must internally convert it to the argument type they
+ were expecting.
+\end_layout
+
+\begin_layout Standard
+If this type changes, the compiler will not produce warnings on the callers,
+ since it only sees void *.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+With careful use of macros, we can create callback functions which give
+ a warning when used on gcc and the types of the callback and its private
+ argument differ.
+ Unsupported compilers will not give a warning, which is no worse than now.
+ In addition, the callbacks become clearer, as they need not use void *
+ for their parameter.
+\end_layout
+
+\begin_layout Standard
+See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
+\begin_layout Subsection
+TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
+\end_layout
+
+\begin_layout Standard
+The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB file should
+ be cleared if the caller discovers it is the only process with the TDB
+ open.
+ However, if any caller does not specify TDB_CLEAR_IF_FIRST it will not
+ be detected, so will have the TDB erased underneath them (usually resulting
+ in a crash).
+\end_layout
+
+\begin_layout Standard
+There is a similar issue on fork(); if the parent exits (or otherwise closes
+ the tdb) before the child calls tdb_reopen_all() to establish the lock
+ used to indicate the TDB is opened by someone, a TDB_CLEAR_IF_FIRST opener
+ at that moment will believe it alone has opened the TDB and will erase
+ it.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove TDB_CLEAR_IF_FIRST.
+ Other workarounds are possible, but see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1298979699
+Incomplete, TDB_CLEAR_IF_FIRST still defined, but not implemented.
+\change_inserted 0 1298979700
+Complete.
+\change_unchanged
+
+\end_layout
+
+\begin_layout Subsection
+Extending The Header Is Difficult
+\end_layout
+
+\begin_layout Standard
+We have reserved (zeroed) words in the TDB header, which can be used for
+ future features.
+ If the future features are compulsory, the version number must be updated
+ to prevent old code from accessing the database.
+ But if the future feature is optional, we have no way of telling if older
+ code is accessing the database or not.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The header should contain a
+\begin_inset Quotes eld
+\end_inset
+
+format variant
+\begin_inset Quotes erd
+\end_inset
+
+ value (64-bit).
+ This is divided into two 32-bit parts:
+\end_layout
+
+\begin_layout Enumerate
+The lower part reflects the format variant understood by code accessing
+ the database.
+\end_layout
+
+\begin_layout Enumerate
+The upper part reflects the format variant you must understand to write
+ to the database (otherwise you can only open for reading).
+\end_layout
+
+\begin_layout Standard
+The latter field can only be written at creation time, the former should
+ be written under the OPEN_LOCK when opening the database for writing, if
+ the variant of the code is lower than the current lowest variant.
+\end_layout
+
+\begin_layout Standard
+This should allow backwards-compatible features to be added, and detection
+ if older code (which doesn't understand the feature) writes to the database.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
+\begin_layout Subsection
+Record Headers Are Not Expandable
+\end_layout
+
+\begin_layout Standard
+If we later want to add (say) checksums on keys and data, it would require
+ another format change, which we'd like to avoid.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We often have extra padding at the tail of a record.
+ If we ensure that the first byte (if any) of this padding is zero, we will
+ have a way for future changes to detect code which doesn't understand a
+ new format: the new code would write (say) a 1 at the tail, and thus if
+ there is no tail or the first byte is 0, we would know the extension is
+ not present on that record.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
+\begin_layout Subsection
+TDB Does Not Use Talloc
+\end_layout
+
+\begin_layout Standard
+Many users of TDB (particularly Samba) use the talloc allocator, and thus
+ have to wrap TDB in a talloc context to use it conveniently.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The allocation within TDB is not complicated enough to justify the use of
+ talloc, and I am reluctant to force another (excellent) library on TDB
+ users.
+ Nonetheless a compromise is possible.
+ An attribute (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "attributes"
+
+\end_inset
+
+) can be added later to tdb_open() to provide an alternate allocation mechanism,
+ specifically for talloc but usable by any other allocator (which would
+ ignore the
+\begin_inset Quotes eld
+\end_inset
+
+context
+\begin_inset Quotes erd
+\end_inset
+
+ argument).
+\end_layout
+
+\begin_layout Standard
+This would form a talloc hierarchy as expected, but the caller would still
+ have to attach a destructor to the tdb context returned from tdb_open to
+ close it.
+ All TDB_DATA fields would be children of the tdb_context, and the caller
+ would still have to manage them (using talloc_free() or talloc_steal()).
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Section
+Performance And Scalability Issues
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB_CLEAR_IF_FIRST-Imposes-Performance"
+
+\end_inset
+
+TDB_CLEAR_IF_FIRST Imposes Performance Penalty
+\end_layout
+
+\begin_layout Standard
+When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is placed at offset
+ 4 (aka.
+ the ACTIVE_LOCK).
+ While these locks never conflict in normal tdb usage, they do add substantial
+ overhead for most fcntl lock implementations when the kernel scans to detect
+ if a lock conflict exists.
+ This is often a single linked list, making the time to acquire and release
+ a fcntl lock O(N) where N is the number of processes with the TDB open,
+ not the number actually doing work.
+\end_layout
+
+\begin_layout Standard
+In a Samba server it is common to have huge numbers of clients sitting idle,
+ and thus they have weaned themselves off the TDB_CLEAR_IF_FIRST flag.
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+There is a flag to tdb_reopen_all() which is used for this optimization:
+ if the parent process will outlive the child, the child does not need the
+ ACTIVE_LOCK.
+ This is a workaround for this very performance issue.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove the flag.
+ It was a neat idea, but even trivial servers tend to know when they are
+ initializing for the first time and can simply unlink the old tdb at that
+ point.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1298979837
+Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing.
+\change_inserted 0 1298979837
+Complete.
+\change_unchanged
+
+\end_layout
+
+\begin_layout Subsection
+TDB Files Have a 4G Limit
+\end_layout
+
+\begin_layout Standard
+This seems to be becoming an issue (so much for
+\begin_inset Quotes eld
+\end_inset
+
+trivial
+\begin_inset Quotes erd
+\end_inset
+
+!), particularly for ldb.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+A new, incompatible TDB format which uses 64 bit offsets internally rather
+ than 32 bit as now.
+ For simplicity of endian conversion (which TDB does on the fly if required),
+ all values will be 64 bit on disk.
+ In practice, some upper bits may be used for other purposes, but at least
+ 56 bits will be available for file offsets.
+\end_layout
+
+\begin_layout Standard
+tdb_open() will automatically detect the old version, and even create them
+ if TDB_VERSION6 is specified to tdb_open.
+\end_layout
+
+\begin_layout Standard
+32 bit processes will still be able to access TDBs larger than 4G (assuming
+ that their off_t allows them to seek to 64 bits), they will gracefully
+ fall back as they fail to mmap.
+ This can happen already with large TDBs.
+\end_layout
+
+\begin_layout Standard
+Old versions of tdb will fail to open the new TDB files (since 28 August
+ 2009, commit 398d0c29290: prior to that any unrecognized file format would
+ be erased and initialized as a fresh tdb!)
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+TDB Records Have a 4G Limit
+\end_layout
+
+\begin_layout Standard
+This has not been a reported problem, and the API uses size_t which can
+ be 64 bit on 64 bit platforms.
+ However, other limits may have made such an issue moot.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Record sizes will be 64 bit, with an error returned on 32 bit platforms
+ which try to access such records (the current implementation would return
+ TDB_ERR_OOM in a similar case).
+ It seems unlikely that 32 bit keys will be a limitation, so the implementation
+ may not support this (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Records-Incur-A"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Hash Size Is Determined At TDB Creation Time
+\end_layout
+
+\begin_layout Standard
+TDB contains a number of hash chains in the header; the number is specified
+ at creation time, and defaults to 131.
+ This is such a bottleneck on large databases (as each hash chain gets quite
+ long), that LDB uses 10,000 for this hash.
+ In general it is impossible to know what the 'right' answer is at database
+ creation time.
+\end_layout
+
+\begin_layout Subsubsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:Hash-Size-Solution"
+
+\end_inset
+
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+After comprehensive performance testing on various scalable hash variants
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoying
+ because I was previously convinced that an expanding tree of hashes would
+ be very close to optimal.
+\end_layout
+
+\end_inset
+
+, it became clear that it is hard to beat a straight linear hash table which
+ doubles in size when it reaches saturation.
+ Unfortunately, altering the hash table introduces serious locking complications
+: the entire hash table needs to be locked to enlarge the hash table, and
+ others might be holding locks.
+ Particularly insidious are insertions done under tdb_chainlock.
+\end_layout
+
+\begin_layout Standard
+Thus an expanding layered hash will be used: an array of hash groups, with
+ each hash group exploding into pointers to lower hash groups once it fills,
+ turning into a hash tree.
+ This has implications for locking: we must lock the entire group in case
+ we need to expand it, yet we don't know how deep the tree is at that point.
+\end_layout
+
+\begin_layout Standard
+Note that bits from the hash table entries should be stolen to hold more
+ hash bits to reduce the penalty of collisions.
+ We can use the otherwise-unused lower 3 bits.
+ If we limit the size of the database to 64 exabytes, we can use the top
+ 8 bits of the hash entry as well.
+ These 11 bits would reduce false positives down to 1 in 2000 which is more
+ than we need: we can use one of the bits to indicate that the extra hash
+ bits are valid.
+ This means we can choose not to re-hash all entries when we expand a hash
+ group; simply use the next bits we need and mark them invalid.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB-Freelist-Is"
+
+\end_inset
+
+TDB Freelist Is Highly Contended
+\end_layout
+
+\begin_layout Standard
+TDB uses a single linked list for the free list.
+ Allocation occurs as follows, using heuristics which have evolved over
+ time:
+\end_layout
+
+\begin_layout Enumerate
+Get the free list lock for this whole operation.
+\end_layout
+
+\begin_layout Enumerate
+Multiply length by 1.25, so we always over-allocate by 25%.
+\end_layout
+
+\begin_layout Enumerate
+Set the slack multiplier to 1.
+\end_layout
+
+\begin_layout Enumerate
+Examine the current freelist entry: if it is > length but < the current
+ best case, remember it as the best case.
+\end_layout
+
+\begin_layout Enumerate
+Multiply the slack multiplier by 1.05.
+\end_layout
+
+\begin_layout Enumerate
+If our best fit so far is less than length * slack multiplier, return it.
+ The slack will be turned into a new free record if it's large enough.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, go onto the next freelist entry.
+\end_layout
+
+\begin_layout Standard
+Deleting a record occurs as follows:
+\end_layout
+
+\begin_layout Enumerate
+Lock the hash chain for this whole operation.
+\end_layout
+
+\begin_layout Enumerate
+Walk the chain to find the record, keeping the prev pointer offset.
+\end_layout
+
+\begin_layout Enumerate
+If max_dead is non-zero:
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+Walk the hash chain again and count the dead records.
+\end_layout
+
+\begin_layout Enumerate
+If it's more than max_dead, bulk free all the dead ones (similar to steps
+ 4 and below, but the lock is only obtained once).
+\end_layout
+
+\begin_layout Enumerate
+Simply mark this record as dead and return.
+
+\end_layout
+
+\end_deeper
+\begin_layout Enumerate
+Get the free list lock for the remainder of this operation.
+\end_layout
+
+\begin_layout Enumerate
+\begin_inset CommandInset label
+LatexCommand label
+name "right-merging"
+
+\end_inset
+
+Examine the following block to see if it is free; if so, enlarge the current
+ block and remove that block from the free list.
+ This was disabled, as removal from the free list was O(entries-in-free-list).
+\end_layout
+
+\begin_layout Enumerate
+Examine the preceding block to see if it is free: for this reason, each
+ block has a 32-bit tailer which indicates its length.
+ If it is free, expand it to cover our new block and return.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, prepend ourselves to the free list.
+\end_layout
+
+\begin_layout Standard
+Disabling right-merging (step
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "right-merging"
+
+\end_inset
+
+) causes fragmentation; the other heuristics proved insufficient to address
+ this, so the final answer to this was that when we expand the TDB file
+ inside a transaction commit, we repack the entire tdb.
+\end_layout
+
+\begin_layout Standard
+The single list lock limits our allocation rate; due to the other issues
+ this is not currently seen as a bottleneck.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The first step is to remove all the current heuristics, as they obviously
+ interact, then examine them once the lock contention is addressed.
+\end_layout
+
+\begin_layout Standard
+The free list must be split to reduce contention.
+ Assuming perfect free merging, we can at most have 1 free list entry for
+ each entry.
+ This implies that the number of free lists is related to the size of the
+ hash table, but as it is rare to walk a large number of free list entries
+ we can use far fewer, say 1/32 of the number of hash buckets.
+\end_layout
+
+\begin_layout Standard
+It seems tempting to try to reuse the hash implementation which we use for
+ records here, but we have two ways of searching for free entries: for allocatio
+n we search by size (and possibly zone) which produces too many clashes
+ for our hash table to handle well, and for coalescing we search by address.
+ Thus an array of doubly-linked free lists seems preferable.
+\end_layout
+
+\begin_layout Standard
+There are various benefits in using per-size free lists (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+) but it's not clear this would reduce contention in the common case where
+ all processes are allocating/freeing the same size.
+ Thus we almost certainly need to divide in other ways: the most obvious
+ is to divide the file into zones, and use a free list (or table of free
+ lists) for each.
+ This approximates address ordering.
+\end_layout
+
+\begin_layout Standard
+Unfortunately it is difficult to know what heuristics should be used to
+ determine zone sizes, and our transaction code relies on being able to
+ create a
+\begin_inset Quotes eld
+\end_inset
+
+recovery area
+\begin_inset Quotes erd
+\end_inset
+
+ by simply appending to the file (difficult if it would need to create a
+ new zone header).
+ Thus we use a linked-list of free tables; currently we only ever create
+ one, but if there is more than one we choose one at random to use.
+ In future we may use heuristics to add new free tables on contention.
+ We only expand the file when all free tables are exhausted.
+\end_layout
+
+\begin_layout Standard
+The basic algorithm is as follows.
+ Freeing is simple:
+\end_layout
+
+\begin_layout Enumerate
+Identify the correct free list.
+\end_layout
+
+\begin_layout Enumerate
+Lock the corresponding list.
+\end_layout
+
+\begin_layout Enumerate
+Re-check the list (we didn't have a lock, sizes could have changed): relock
+ if necessary.
+\end_layout
+
+\begin_layout Enumerate
+Place the freed entry in the list.
+\end_layout
+
+\begin_layout Standard
+Allocation is a little more complicated, as we perform delayed coalescing
+ at this point:
+\end_layout
+
+\begin_layout Enumerate
+Pick a free table; usually the previous one.
+\end_layout
+
+\begin_layout Enumerate
+Lock the corresponding list.
+\end_layout
+
+\begin_layout Enumerate
+If the top entry is large enough, remove it from the list and return it.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, coalesce entries in the list. If there was no entry large enough,
+ unlock the list and try the next largest list.
+\end_layout
+
+\begin_layout Enumerate
+If no list has an entry which meets our needs, try the next free table.
+\end_layout
+
+\begin_layout Enumerate
+If no zone satisfies, expand the file.
+\end_layout
+
+\begin_layout Standard
+This optimizes rapid insert/delete of free list entries by not coalescing
+ them all the time.
+ First-fit address ordering seems to be fairly good for keeping
+ fragmentation low (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+).
+ Note that address ordering does not need a tailer to coalesce, though if
+ we needed one we could have one cheaply: see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Records-Incur-A"
+
+\end_inset
+
+.
+
+\end_layout
+
+\begin_layout Standard
+Each free entry has the free table number in the header: less than 255.
+ It also contains a doubly-linked list for easy deletion.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+TDB Becomes Fragmented
+\end_layout
+
+\begin_layout Standard
+Much of this is a result of allocation strategy
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 ftp://ftp.cs.ute
+xas.edu/pub/garbage/malloc/ismm98.ps
+\end_layout
+
+\end_inset
+
+ and deliberate hobbling of coalescing; internal fragmentation (aka overallocati
+on) is deliberately set at 25%, and external fragmentation is only cured
+ by the decision to repack the entire db when a transaction commit needs
+ to enlarge the file.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The 25% overhead on allocation works in practice for ldb because indexes
+ tend to expand by one record at a time.
+ This internal fragmentation can be resolved by having an
+\begin_inset Quotes eld
+\end_inset
+
+expanded
+\begin_inset Quotes erd
+\end_inset
+
+ bit in the header to note entries that have previously expanded, and allocating
+ more space for them.
+\end_layout
+
+\begin_layout Standard
+There is a spectrum of possible solutions for external fragmentation:
+ one is to use a fragmentation-avoiding allocation strategy such as best-fit
+ address-order allocator.
+ The other end of the spectrum would be to use a bump allocator (very fast
+ and simple) and simply repack the file when we reach the end.
+\end_layout
+
+\begin_layout Standard
+There are three problems with efficient fragmentation-avoiding allocators:
+ they are non-trivial, they tend to use a single free list for each size,
+ and there's no evidence that tdb allocation patterns will match those recorded
+ for general allocators (though it seems likely).
+\end_layout
+
+\begin_layout Standard
+Thus we don't spend too much effort on external fragmentation; we will be
+ no worse than the current code if we need to repack on occasion.
+ More effort is spent on reducing freelist contention, and reducing overhead.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:Records-Incur-A"
+
+\end_inset
+
+Records Incur A 28-Byte Overhead
+\end_layout
+
+\begin_layout Standard
+Each TDB record has a header as follows:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_record {
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_off_t next; /* offset of the next record in the list */
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_len_t rec_len; /* total byte length of record */
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_len_t key_len; /* byte length of key */
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_len_t data_len; /* byte length of data */
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t full_hash; /* the full 32 bit hash of the key */
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t magic;   /* try to catch errors */
+\end_layout
+
+\begin_layout LyX-Code
+        /* the following union is implied:
+\end_layout
+
+\begin_layout LyX-Code
+                union {
+\end_layout
+
+\begin_layout LyX-Code
+                        char record[rec_len];
+\end_layout
+
+\begin_layout LyX-Code
+                        struct {
+\end_layout
+
+\begin_layout LyX-Code
+                                char key[key_len];
+\end_layout
+
+\begin_layout LyX-Code
+                                char data[data_len];
+\end_layout
+
+\begin_layout LyX-Code
+                        }
+\end_layout
+
+\begin_layout LyX-Code
+                        uint32_t totalsize; (tailer)
+\end_layout
+
+\begin_layout LyX-Code
+                }
+\end_layout
+
+\begin_layout LyX-Code
+        */
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+Naively, this would double to a 56-byte overhead on a 64 bit implementation.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We can use various techniques to reduce this for an allocated block:
+\end_layout
+
+\begin_layout Enumerate
+The 'next' pointer is not required, as we are using a flat hash table.
+\end_layout
+
+\begin_layout Enumerate
+'rec_len' can instead be expressed as an addition to key_len and data_len
+ (it accounts for wasted or overallocated length in the record).
+ Since the record length is always a multiple of 8, we can conveniently
+ fit it in 32 bits (representing up to 35 bits).
+\end_layout
+
+\begin_layout Enumerate
+'key_len' and 'data_len' can be reduced.
+ I'm unwilling to restrict 'data_len' to 32 bits, but instead we can combine
+ the two into one 64-bit field and use a 5 bit value which indicates at
+ what bit to divide the two.
+ Keys are unlikely to scale as fast as data, so I'm assuming a maximum key
+ size of 32 bits.
+\end_layout
+
+\begin_layout Enumerate
+'full_hash' is used to avoid a memcmp on the
+\begin_inset Quotes eld
+\end_inset
+
+miss
+\begin_inset Quotes erd
+\end_inset
+
+ case, but this is diminishing returns after a handful of bits (at 10 bits,
+ it reduces 99.9% of false memcmp).
+ As an aside, as the lower bits are already incorporated in the hash table
+ resolution, the upper bits should be used here.
+ Note that it's not clear that these bits will be a win, given the extra
+ bits in the hash table itself (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Hash-Size-Solution"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Enumerate
+'magic' does not need to be enlarged: it currently reflects one of 5 values
+ (used, free, dead, recovery, and unused_recovery).
+ It is useful for quick sanity checking however, and should not be eliminated.
+\end_layout
+
+\begin_layout Enumerate
+'tailer' is only used to coalesce free blocks (so a block to the right can
+ find the header to check if this block is free).
+ This can be replaced by a single 'free' bit in the header of the following
+ block (and the tailer only exists in free blocks).
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+This technique from Thomas Standish.
+ Data Structure Techniques.
+ Addison-Wesley, Reading, Massachusetts, 1980.
+\end_layout
+
+\end_inset
+
+ The current proposed coalescing algorithm doesn't need this, however.
+\end_layout
+
+\begin_layout Standard
+This produces a 16 byte used header like this:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_used_record {
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t used_magic : 16,
+\end_layout
+
+\begin_layout LyX-Code
+
+\end_layout
+
+\begin_layout LyX-Code
+                 key_data_divide: 5,
+\end_layout
+
+\begin_layout LyX-Code
+                 top_hash: 11;
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t extra_octets;
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t key_and_data_len;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+And a free record like this:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_free_record {
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t free_magic: 8,
+\end_layout
+
+\begin_layout LyX-Code
+                   prev : 56;
+\end_layout
+
+\begin_layout LyX-Code
+
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t free_table: 8,
+\end_layout
+
+\begin_layout LyX-Code
+                 total_length : 56;
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t next;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1291206079
+
+\change_unchanged
+Note that by limiting valid offsets to 56 bits, we can pack everything we
+ need into 3 64-bit words, meaning our minimum record size is 8 bytes.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Transaction Commit Requires 4 fdatasync
+\end_layout
+
+\begin_layout Standard
+The current transaction algorithm is:
+\end_layout
+
+\begin_layout Enumerate
+write_recovery_data();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+write_recovery_header();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+overwrite_with_new_data();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+remove_recovery_header();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Standard
+On current ext3, each sync flushes all data to disk, so the next 3 syncs
+ are relatively inexpensive.
+ But this could become a performance bottleneck on other filesystems such
+ as ext4.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Neil Brown points out that this is overzealous, and only one sync is needed:
+\end_layout
+
+\begin_layout Enumerate
+Bundle the recovery data, a transaction counter and a strong checksum of
+ the new data.
+\end_layout
+
+\begin_layout Enumerate
+Strong checksum that whole bundle.
+\end_layout
+
+\begin_layout Enumerate
+Store the bundle in the database.
+\end_layout
+
+\begin_layout Enumerate
+Overwrite the oldest of the two recovery pointers in the header (identified
+ using the transaction counter) with the offset of this bundle.
+\end_layout
+
+\begin_layout Enumerate
+sync.
+\end_layout
+
+\begin_layout Enumerate
+Write the new data to the file.
+\end_layout
+
+\begin_layout Standard
+Checking for recovery means identifying the latest bundle with a valid checksum
+ and using the new data checksum to ensure that it has been applied.
+ This is more expensive than the current check, but need only be done at
+ open.
+ For running databases, a separate header field can be used to indicate
+ a transaction in progress; we need only check for recovery if this is set.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:TDB-Does-Not"
+
+\end_inset
+
+TDB Does Not Have Snapshot Support
+\end_layout
+
+\begin_layout Subsubsection
+Proposed SolutionNone.
+ At some point you say
+\begin_inset Quotes eld
+\end_inset
+
+use a real database
+\begin_inset Quotes erd
+\end_inset
+
+ (but see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "replay-attribute"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Standard
+But as a thought experiment, if we implemented transactions to only overwrite
+ free entries (this is tricky: there must not be a header in each entry
+ which indicates whether it is free, relying instead on presence in metadata elsewhere),
+ and a pointer to the hash table, we could create an entirely new commit
+ without destroying existing data.
+ Then it would be easy to implement snapshots in a similar way.
+\end_layout
+
+\begin_layout Standard
+This would not allow arbitrary changes to the database, such as tdb_repack
+ does, and would require more space (since we have to preserve the current
+ and future entries at once).
+ If we used hash trees rather than one big hash table, we might only have
+ to rewrite some sections of the hash, too.
+\end_layout
+
+\begin_layout Standard
+We could then implement snapshots using a similar method, using multiple
+ different hash tables/free tables.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Subsection
+Transactions Cannot Operate in Parallel
+\end_layout
+
+\begin_layout Standard
+This would be useless for ldb, as it hits the index records with just about
+ every update.
+ It would add significant complexity in resolving clashes, and cause
+ all transaction callers to write their code to loop in the case where the
+ transactions spuriously failed.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None (but see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "replay-attribute"
+
+\end_inset
+
+).
+ We could solve a small part of the problem by providing read-only transactions.
+ These would allow one write transaction to begin, but it could not commit
+ until all r/o transactions are done.
+ This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
+ commit.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Subsection
+Default Hash Function Is Suboptimal
+\end_layout
+
+\begin_layout Standard
+The Knuth-inspired multiplicative hash used by tdb is fairly slow (especially
+ if we expand it to 64 bits), and works best when the hash bucket size is
+ a prime number (which also means a slow modulus).
+ In addition, it is highly predictable which could potentially lead to a
+ Denial of Service attack in some TDB uses.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The Jenkins lookup3 hash
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+http://burtleburtle.net/bob/c/lookup3.c
+\end_layout
+
+\end_inset
+
+ is a fast and superbly-mixing hash.
+ It's used by the Linux kernel and almost everything else.
+ This has the particular properties that it takes an initial seed, and produces
+ two 32 bit hash numbers, which we can combine into a 64-bit hash.
+\end_layout
+
+\begin_layout Standard
+The seed should be created at tdb-creation time from some random source,
+ and placed in the header.
+ This is far from foolproof, but adds a little bit of protection against
+ hash bombing.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "Reliable-Traversal-Adds"
+
+\end_inset
+
+Reliable Traversal Adds Complexity
+\end_layout
+
+\begin_layout Standard
+We lock a record during traversal iteration, and try to grab that lock in
+ the delete code.
+ If that grab on delete fails, we simply mark it deleted and continue onwards;
+ traversal checks for this condition and does the delete when it moves off
+ the record.
+\end_layout
+
+\begin_layout Standard
+If traversal terminates, the dead record may be left indefinitely.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove reliability guarantees; see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "traverse-Proposed-Solution"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Fcntl Locking Adds Overhead
+\end_layout
+
+\begin_layout Standard
+Placing a fcntl lock means a system call, as does removing one.
+ This is actually one reason why transactions can be faster (everything
+ is locked once at transaction start).
+ In the uncontended case, this overhead can theoretically be eliminated.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+\end_layout
+
+\begin_layout Standard
+We tried this before with spinlock support, in the early days of TDB, and
+ it didn't make much difference except in manufactured benchmarks.
+\end_layout
+
+\begin_layout Standard
+We could use spinlocks (with futex kernel support under Linux), but it means
+ that we lose automatic cleanup when a process dies with a lock.
+ There is a method of auto-cleanup under Linux, but it's not supported by
+ other operating systems.
+ We could reintroduce a clear-if-first-style lock and sweep for dead futexes
+ on open, but that wouldn't help the normal case of one concurrent opener
+ dying.
+ Increasingly elaborate repair schemes could be considered, but they require
+ an ABI change (everyone must use them) anyway, so there's no need to do
+ this at the same time as everything else.
+\end_layout
+
+\begin_layout Subsection
+Some Transactions Don't Require Durability
+\end_layout
+
+\begin_layout Standard
+Volker points out that gencache uses a CLEAR_IF_FIRST tdb for normal (fast)
+ usage, and occasionally empties the results into a transactional TDB.
+ This kind of usage prioritizes performance over durability: as long as
+ we are consistent, data can be lost.
+\end_layout
+
+\begin_layout Standard
+This would be more neatly implemented inside tdb: a
+\begin_inset Quotes eld
+\end_inset
+
+soft
+\begin_inset Quotes erd
+\end_inset
+
+ transaction commit (i.e.
+ syncless) which means that data may be reverted on a crash.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+\end_layout
+
+\begin_layout Standard
+Unfortunately any transaction scheme which overwrites old data requires
+ a sync before that overwrite to avoid the possibility of corruption.
+\end_layout
+
+\begin_layout Standard
+It seems possible to use a scheme similar to that described in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Does-Not"
+
+\end_inset
+
+, where transactions are committed without overwriting existing data, and
+ an array of top-level pointers were available in the header.
+ If the transaction is
+\begin_inset Quotes eld
+\end_inset
+
+soft
+\begin_inset Quotes erd
+\end_inset
+
+ then we would not need a sync at all: existing processes would pick up
+ the new hash table and free list and work with that.
+\end_layout
+
+\begin_layout Standard
+At some later point, a sync would allow recovery of the old data into the
+ free lists (perhaps when the array of top-level pointers filled).
+ On crash, tdb_open() would examine the array of top levels, and apply the
+ transactions until it encountered an invalid checksum.
+\end_layout
+
+\begin_layout Subsection
+Tracing Is Fragile, Replay Is External
+\end_layout
+
+\begin_layout Standard
+The current TDB has compile-time-enabled tracing code, but it often breaks
+ as it is not enabled by default.
+ In a similar way, the ctdb code has an external wrapper which does replay
+ tracing so it can coordinate cluster-wide transactions.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\begin_inset CommandInset label
+LatexCommand label
+name "replay-attribute"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Tridge points out that an attribute can be later added to tdb_open (see
+
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "attributes"
+
+\end_inset
+
+) to provide replay/trace hooks, which could become the basis for this and
+ future parallel transactions and snapshot support.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\end_body
+\end_document
+@
+
+
+1.12
+log
+@Add status, some fixes, linked freelists.
+@
+text
+@d53 1
+a53 7
+
+\change_deleted 0 1291204535
+14-September
+\change_inserted 0 1291204533
+1-December
+\change_unchanged
+-2010
+a580 2
+\change_inserted 0 1291204563
+
+a583 2
+
+\change_inserted 0 1291204572
+a587 2
+
+\change_inserted 0 1291204573
+a588 2
+\change_unchanged
+
+a629 2
+\change_inserted 0 1291204588
+
+a632 2
+
+\change_inserted 0 1291204588
+a636 2
+
+\change_inserted 0 1291204631
+a639 2
+\change_unchanged
+
+a693 2
+\change_inserted 0 1291204639
+
+a696 2
+
+\change_inserted 0 1291204640
+d702 1
+a702 1
+\change_inserted 0 1291204665
+d704 2
+a728 2
+\change_inserted 0 1291204671
+
+a731 2
+
+\change_inserted 0 1291204671
+a735 2
+
+\change_inserted 0 1291204673
+a736 2
+\change_unchanged
+
+a780 2
+\change_inserted 0 1291204731
+
+a783 2
+
+\change_inserted 0 1291204732
+a787 2
+
+\change_inserted 0 1291204779
+a790 2
+\change_unchanged
+
+a842 2
+\change_inserted 0 1291204830
+
+a845 2
+
+\change_inserted 0 1291204831
+a849 2
+
+\change_inserted 0 1291204834
+a850 2
+\change_unchanged
+
+d879 9
+a887 2
+ deal of churn; we are better to guarantee that the tdb_errcode is per-thread
+ so the current programming model can be maintained.
+d891 9
+d903 2
+a922 2
+\change_inserted 0 1291204847
+
+a925 2
+
+\change_inserted 0 1291204847
+d930 5
+a934 3
+
+\change_inserted 0 1291204852
+Incomplete.
+a1051 2
+\change_inserted 0 1291204881
+
+a1054 2
+
+\change_inserted 0 1291204881
+a1058 2
+
+\change_inserted 0 1291204885
+a1059 2
+\change_unchanged
+
+a1140 2
+\change_inserted 0 1291204898
+
+a1143 2
+
+\change_inserted 0 1291204898
+a1147 2
+
+\change_inserted 0 1291204901
+a1148 2
+\change_unchanged
+
+a1224 2
+\change_inserted 0 1291204908
+
+a1227 2
+
+\change_inserted 0 1291204908
+a1231 2
+
+\change_inserted 0 1291204908
+a1232 2
+\change_unchanged
+
+a1271 2
+\change_inserted 0 1291204917
+
+a1274 2
+
+\change_inserted 0 1291204917
+a1278 2
+
+\change_inserted 0 1291204920
+a1279 2
+\change_unchanged
+
+a1316 2
+\change_inserted 0 1291204927
+
+a1319 2
+
+\change_inserted 0 1291204928
+d1325 1
+a1325 1
+\change_inserted 0 1291204942
+d1327 2
+a1381 2
+\change_inserted 0 1291205003
+
+a1384 2
+
+\change_inserted 0 1291205004
+a1388 2
+
+\change_inserted 0 1291205007
+a1411 2
+\change_inserted 0 1291205019
+
+a1414 2
+
+\change_inserted 0 1291205019
+a1418 2
+
+\change_inserted 0 1291205023
+a1419 2
+\change_unchanged
+
+a1465 2
+\change_inserted 0 1291205029
+
+a1468 2
+
+\change_inserted 0 1291205029
+a1472 2
+
+\change_inserted 0 1291206020
+a1473 2
+\change_unchanged
+
+a1528 2
+\change_inserted 0 1291205043
+
+a1531 2
+
+\change_inserted 0 1291205043
+d1537 1
+a1537 1
+\change_inserted 0 1291205057
+d1539 2
+a1589 2
+\change_inserted 0 1291205062
+
+a1592 2
+
+\change_inserted 0 1291205062
+a1596 2
+
+\change_inserted 0 1291205062
+a1597 2
+\change_unchanged
+
+a1626 2
+\change_inserted 0 1291205072
+
+a1629 2
+
+\change_inserted 0 1291205073
+a1633 2
+
+\change_inserted 0 1291205073
+a1634 2
+\change_unchanged
+
+a1674 4
+
+\change_deleted 0 1291204504
+
+\change_unchanged
+a1699 2
+\change_inserted 0 1291205079
+
+a1702 2
+
+\change_inserted 0 1291205080
+a1706 2
+
+\change_inserted 0 1291205080
+a1707 2
+\change_unchanged
+
+a1833 2
+\change_inserted 0 1291205090
+
+d1869 2
+a1870 7
+ is to divide the file into zones, and using a free list (or
+\change_inserted 0 1291205498
+table
+\change_deleted 0 1291205497
+set
+\change_unchanged
+ of free lists) for each.
+a1871 2
+\change_inserted 0 1291205203
+
+a1874 2
+
+\change_inserted 0 1291205358
+a1890 21
+\change_unchanged
+
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1291205198
+Note that this means we need to split the free lists when we expand the
+ file; this is probably acceptable when we double the hash table size, since
+ that is such an expensive operation already.
+ In the case of increasing the file size, there is an optimization we can
+ use: if we use M in the formula above as the file size rounded up to the
+ next power of 2, we only need reshuffle free lists when the file size crosses
+ a power of 2 boundary,
+\emph on
+and
+\emph default
+reshuffling the free lists is trivial: we simply merge every consecutive
+ pair of free lists.
+\change_unchanged
+
+d1899 1
+a1899 7
+Identify the correct
+\change_inserted 0 1291205366
+free list
+\change_deleted 0 1291205364
+zone
+\change_unchanged
+.
+d1907 2
+a1908 7
+Re-check the
+\change_inserted 0 1291205372
+list
+\change_deleted 0 1291205371
+zone
+\change_unchanged
+ (we didn't have a lock, sizes could have changed): relock if necessary.
+d1912 1
+a1912 5
+Place the freed entry in the list
+\change_deleted 0 1291205382
+ for that zone
+\change_unchanged
+.
+d1921 1
+a1921 15
+Pick a
+\change_deleted 0 1291205403
+zone either the zone we last freed into, or based on a
+\begin_inset Quotes eld
+\end_inset
+
+random
+\begin_inset Quotes erd
+\end_inset
+
+ number.
+\change_inserted 0 1291205411
+free table; usually the previous one.
+\change_unchanged
+
+a1925 10
+\change_deleted 0 1291205432
+
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1291205428
+Re-check the zone: relock if necessary.
+\change_unchanged
+
+d1934 1
+a1934 7
+ unlock the list and try the next
+\change_inserted 0 1291205455
+largest list
+\change_deleted 0 1291205452
+zone.
+\change_inserted 0 1291205457
+
+a1937 2
+
+\change_inserted 0 1291205476
+a1938 2
+\change_unchanged
+
+a1966 2
+\change_inserted 0 1291205542
+
+a1969 2
+
+\change_inserted 0 1291205591
+a1971 70
+\change_unchanged
+
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1291205539
+I anticipate that the number of entries in each free zone would be small,
+ but it might be worth using one free entry to hold pointers to the others
+ for cache efficiency.
+\change_unchanged
+
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1291205534
+\begin_inset CommandInset label
+LatexCommand label
+name "freelist-in-zone"
+
+\end_inset
+
+If we want to avoid locking complexity (enlarging the free lists when we
+ enlarge the file) we could place the array of free lists at the beginning
+ of each zone.
+ This means existing array lists never move, but means that a record cannot
+ be larger than a zone.
+ That in turn implies that zones should be variable sized (say, power of
+ 2), which makes the question
+\begin_inset Quotes eld
+\end_inset
+
+what zone is this record in?
+\begin_inset Quotes erd
+\end_inset
+
+ much harder (and
+\begin_inset Quotes eld
+\end_inset
+
+pick a random zone
+\begin_inset Quotes erd
+\end_inset
+
+, but that's less common).
+ It could be done with as few as 4 bits from the record header.
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+Using
+\begin_inset Formula $2^{16+N*3}$
+\end_inset
+
+means 0 gives a minimal 65536-byte zone, 15 gives the maximal
+\begin_inset Formula $2^{61}$
+\end_inset
+
+ byte zone.
+ Zones range in factor of 8 steps.
+ Given the zone size for the zone the current record is in, we can determine
+ the start of the zone.
+\end_layout
+
+\end_inset
+
+
+\change_inserted 0 1291205139
+
+d2218 1
+a2218 5
+        uint32_t
+\change_inserted 0 1291205758
+used_
+\change_unchanged
+magic : 16,
+a2222 4
+\change_deleted 0 1291205693
+                 prev_is_free: 1,
+\change_unchanged
+
+d2230 1
+a2230 7
+                 top_hash: 1
+\change_inserted 0 1291205704
+1
+\change_deleted 0 1291205704
+0
+\change_unchanged
+;
+d2254 1
+a2254 9
+        uint
+\change_inserted 0 1291205725
+64
+\change_deleted 0 1291205723
+32
+\change_unchanged
+_t
+\change_inserted 0 1291205753
+free_magic: 8,
+a2257 2
+
+\change_inserted 0 1291205746
+a2262 24
+\change_deleted 0 1291205749
+free_magic;
+\change_unchanged
+
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t
+\change_inserted 0 1291205786
+free_table: 8,
+\end_layout
+
+\begin_layout LyX-Code
+
+\change_inserted 0 1291205788
+
+\change_unchanged
+total_length
+\change_inserted 0 1291205792
+ : 56
+\change_deleted 0 1291205790
+;
+\change_unchanged
+
+d2266 1
+a2266 7
+        uint64_t
+\change_deleted 0 1291205801
+prev,
+\change_unchanged
+next;
+\change_deleted 0 1291205811
+
+d2270 1
+a2270 3
+
+\change_deleted 0 1291205811
+        ...
+d2274 1
+a2274 5
+
+\change_deleted 0 1291205808
+        uint64_t tailer
+\change_unchanged
+;
+d2283 5
+a2287 16
+\change_deleted 0 1291205827
+We might want to take some bits from the used record's top_hash (and the
+ free record which has 32 bits of padding to spare anyway) if we use variable
+ sized zones.
+ See
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "freelist-in-zone"
+
+\end_inset
+
+.
+
+\change_inserted 0 1291205885
+ Note that by limiting valid offsets to 56 bits, we can pack everything
+ we need into 3 64-byte words, meaning our minimum record size is 8 bytes.
+a2290 2
+
+\change_inserted 0 1291205886
+a2294 2
+
+\change_inserted 0 1291205886
+a2295 2
+\change_unchanged
+
+a2385 2
+\change_inserted 0 1291205894
+
+a2388 2
+
+\change_inserted 0 1291205894
+a2392 2
+
+\change_inserted 0 1291205902
+a2393 2
+\change_unchanged
+
+a2415 4
+
+\change_deleted 0 1291204504
+
+\change_unchanged
+a2445 2
+\change_inserted 0 1291205910
+
+a2448 2
+
+\change_inserted 0 1291205910
+a2452 2
+
+\change_inserted 0 1291205914
+a2453 2
+\change_unchanged
+
+a2485 2
+\change_inserted 0 1291205919
+
+a2488 2
+
+\change_inserted 0 1291205919
+a2492 2
+
+\change_inserted 0 1291205922
+a2493 2
+\change_unchanged
+
+a2533 2
+\change_inserted 0 1291205929
+
+a2536 2
+
+\change_inserted 0 1291205929
+a2540 2
+
+\change_inserted 0 1291205929
+a2541 2
+\change_unchanged
+
+a2578 2
+\change_inserted 0 1291205932
+
+a2581 2
+
+\change_inserted 0 1291205933
+a2585 2
+
+\change_inserted 0 1291205933
+a2586 2
+\change_unchanged
+
+a2724 2
+\change_inserted 0 1291205944
+
+a2727 2
+
+\change_inserted 0 1291205945
+a2731 2
+
+\change_inserted 0 1291205948
+a2732 2
+\change_unchanged
+
+@
+
+
+1.11
+log
+@Merge changes
+@
+text
+@d53 7
+a59 1
+14-September-2010
+d587 16
+d644 18
+d716 16
+d753 16
+d813 18
+d883 16
+d953 16
+d1084 16
+d1181 16
+d1273 16
+d1328 16
+d1381 16
+d1447 19
+a1465 2
+ if older code (which doesn't understand the feature) writes to the database.Reco
+rd Headers Are Not Expandible
+d1484 16
+d1546 16
+d1617 16
+d1680 16
+d1725 16
+d1810 16
+d1951 8
+a1958 3
+Proposed SolutionThe first step is to remove all the current heuristics,
+ as they obviously interact, then examine them once the lock contention
+ is addressed.
+d1989 7
+a1995 2
+ is to divide the file into zones, and using a free list (or set of free
+ lists) for each.
+d1997 2
+d2002 25
+d2039 2
+d2049 7
+a2055 1
+Identify the correct zone.
+d2063 7
+a2069 2
+Re-check the zone (we didn't have a lock, sizes could have changed): relock
+ if necessary.
+d2073 5
+a2077 1
+Place the freed entry in the list for that zone.
+d2086 3
+a2088 1
+Pick a zone either the zone we last freed into, or based on a
+d2097 4
+d2105 2
+d2110 2
+d2113 2
+d2123 15
+a2137 1
+ unlock the list and try the next zone.
+d2166 11
+d2180 2
+d2185 2
+d2190 2
+d2223 1
+a2223 1
+status open
+d2243 2
+d2491 5
+a2495 1
+        uint32_t magic : 16,
+d2499 2
+d2502 2
+d2511 7
+a2517 1
+                 top_hash: 10;
+d2541 29
+a2569 1
+        uint32_t free_magic;
+d2573 11
+a2583 1
+        uint64_t total_length;
+d2587 7
+a2593 1
+        uint64_t prev, next;
+d2597 2
+d2603 5
+a2607 1
+        uint64_t tailer;
+d2615 2
+d2628 18
+d2736 16
+d2808 16
+d2856 16
+d2912 16
+d2965 16
+d3119 16
+@
+
+
+1.10
+log
+@Tracing attribute, talloc support.
+@
+text
+@d1 1
+a1 1
+#LyX 1.6.5 created this file. For more info see http://www.lyx.org/
+d53 1
+a53 7
+
+\change_deleted 0 1283307542
+26-July
+\change_inserted 0 1284423485
+14-September
+\change_unchanged
+-2010
+a472 2
+\change_inserted 0 1284422789
+
+a479 2
+\change_unchanged
+
+a838 2
+
+\change_inserted 0 1284016998
+a846 2
+\change_unchanged
+
+a1194 2
+\change_inserted 0 1284015637
+
+a1197 2
+
+\change_inserted 0 1284015716
+a1201 2
+
+\change_inserted 0 1284015906
+a1210 2
+
+\change_inserted 0 1284015637
+a1214 2
+
+\change_inserted 0 1284016114
+a1227 2
+
+\change_inserted 0 1284016149
+a1232 2
+
+\change_inserted 0 1284016639
+a1237 2
+
+\change_inserted 0 1284016821
+a1243 2
+
+\change_inserted 0 1284016803
+d1245 2
+a1246 9
+ if older code (which doesn't understand the feature) writes to the database.
+\change_deleted 0 1284016101
+
+\end_layout
+
+\begin_layout Subsection
+
+\change_inserted 0 1284015634
+Record Headers Are Not Expandible
+a1249 2
+
+\change_inserted 0 1284015634
+a1254 2
+
+\change_inserted 0 1284015634
+a1258 2
+
+\change_inserted 0 1284422552
+a1267 2
+
+\change_inserted 0 1284422568
+a1271 2
+
+\change_inserted 0 1284422646
+a1276 2
+
+\change_inserted 0 1284422656
+a1280 2
+
+\change_inserted 0 1284423065
+a1305 2
+
+\change_inserted 0 1284423042
+a1310 2
+\change_unchanged
+
+a1457 2
+
+\change_inserted 0 1283336713
+a1463 2
+
+\change_unchanged
+d1482 2
+d1485 1
+a1485 51
+\change_deleted 0 1283307675
+There are three details which become important:
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1283307675
+On encountering a full bucket, we use the next bucket.
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1283307675
+Extra hash bits are stored with the offset, to reduce comparisons.
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1283307675
+A marker entry is used on deleting an entry.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1283307675
+The doubling of the table must be done under a transaction; we will not
+ reduce it on deletion, so it will be an unusual case.
+ It will either be placed at the head (other entries will be moved out the
+ way so we can expand).
+ We could have a pointer in the header to the current hashtable location,
+ but that pointer would have to be read frequently to check for hashtable
+ moves.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1283307675
+The locking for this is slightly more complex than the chained case; we
+ currently have one lock per bucket, and that means we would need to expand
+ the lock if we overflow to the next bucket.
+ The frequency of such collisions will effect our locking heuristics: we
+ can always lock more buckets than we need.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1283307675
+One possible optimization is to only re-check the hash size on an insert
+ or a lookup miss.
+
+\change_inserted 0 1283307770
+a1492 2
+
+\change_inserted 0 1283336187
+a1500 2
+
+\change_inserted 0 1283336586
+a1510 2
+\change_unchanged
+
+d1636 3
+a1638 8
+Proposed Solution
+\change_deleted 0 1283336858
+
+\end_layout
+
+\begin_layout Standard
+The first step is to remove all the current heuristics, as they obviously
+ interact, then examine them once the lock contention is addressed.
+a1647 2
+\change_inserted 0 1283336910
+
+a1650 2
+
+\change_inserted 0 1283337052
+a1655 2
+\change_unchanged
+
+a1776 2
+\change_inserted 0 1283309850
+
+a1779 2
+
+\change_inserted 0 1283337216
+a1813 2
+
+\change_inserted 0 1284424151
+a1825 2
+\change_unchanged
+
+a1830 2
+\change_unchanged
+
+a2031 2
+
+\change_inserted 0 1283336739
+a2040 2
+\change_unchanged
+
+a2117 2
+\change_inserted 0 1283337133
+
+a2120 2
+
+\change_inserted 0 1283337139
+a2121 2
+\change_unchanged
+
+a2136 2
+
+\change_inserted 0 1283337235
+a2147 2
+\change_unchanged
+
+d2251 1
+a2251 7
+Proposed Solution
+\change_deleted 0 1284423472
+
+\end_layout
+
+\begin_layout Standard
+None.
+d2261 1
+a2261 1
+\change_inserted 0 1284423891
+d2263 1
+a2263 4
+\change_deleted 0 1284423891
+.
+
+\change_inserted 0 1284423901
+a2271 2
+\change_unchanged
+
+a2293 2
+\change_inserted 0 1284423495
+
+a2312 2
+
+\change_inserted 0 1284424201
+d2321 1
+a2321 3
+
+\change_unchanged
+We could solve a small part of the problem by providing read-only transactions.
+a2505 2
+\change_inserted 0 1284423555
+
+a2508 2
+
+\change_inserted 0 1284423617
+a2512 2
+
+\change_inserted 0 1284423719
+a2519 2
+
+\change_inserted 0 1284423864
+a2530 2
+
+\change_inserted 0 1284423850
+a2540 2
+\change_unchanged
+
+@
+
+
+1.9
+log
+@Extension mechanism.
+@
+text
+@d56 2
+a57 2
+\change_inserted 0 1284016854
+9-September
+d479 11
+d1303 1
+a1303 1
+\change_inserted 0 1284016847
+d1310 56
+d1945 1
+a1945 1
+\change_inserted 0 1283310945
+d1956 2
+d2402 2
+d2416 4
+d2421 12
+d2455 2
+d2476 12
+d2673 47
+@
+
+
+1.8
+log
+@Remove bogus footnote
+@
+text
+@d56 2
+a57 2
+\change_inserted 0 1283307544
+1-September
+d838 12
+d1198 103
+@
+
+
+1.7
+log
+@Moving hash table does not work.
+@
+text
+@a1436 12
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+
+\change_inserted 0 1283336450
+If we make the hash offsets zone-relative, then this only restricts the
+ zone size, not the overall database size.
+\end_layout
+
+\end_inset
+
+@
+
+
+1.6
+log
+@Commit changes
+@
+text
+@d38 1
+a38 1
+\author ""
+d53 7
+a59 1
+26-July-2010
+d1333 10
+d1361 3
+a1363 1
+ There are three details which become important:
+d1367 2
+d1373 2
+d1379 2
+d1385 2
+d1397 2
+d1407 2
+d1411 45
+d1582 2
+d1598 14
+d1733 62
+d1996 13
+d2086 10
+d2110 15
+a2124 1
+\begin_layout LyX-Code
+@
+
+
+1.5
+log
+@Soft transaction commit
+@
+text
+@d38 1
+a38 1
+\author "Rusty Russell,,,"
+a52 4
+
+\change_deleted 0 1280141199
+10-May-2010
+\change_inserted 0 1280141202
+a53 2
+\change_unchanged
+
+a2028 2
+
+\change_inserted 0 1280140902
+a2034 2
+
+\change_unchanged
+a2212 2
+\change_inserted 0 1280140661
+
+a2215 2
+
+\change_inserted 0 1280140703
+a2219 2
+
+\change_inserted 0 1280708312
+a2226 2
+
+\change_inserted 0 1280708400
+a2239 2
+
+\change_inserted 0 1280140836
+a2243 2
+
+\change_inserted 0 1280708255
+a2247 2
+
+\change_inserted 0 1280708374
+a2252 2
+
+\change_inserted 0 1280141181
+a2274 2
+
+\change_inserted 0 1280141345
+@
+
+
+1.4
+log
+@Merge changes
+@
+text
+@d38 1
+a38 1
+\author ""
+d53 2
+d56 4
+d2035 10
+d2223 84
+@
+
+
+1.3
+log
+@Transaction and freelist rethink.
+@
+text
+@d38 1
+a38 1
+\author "Rusty Russell,,,"
+d53 1
+a53 1
+27-April-2010
+d662 1
+a662 5
+ behavior of disallowing
+\change_inserted 0 1272940179
+nested
+\change_unchanged
+transactions should become the default.
+a1210 2
+\change_inserted 0 1272944650
+
+a1214 2
+
+\change_inserted 0 1272944763
+a1218 2
+\change_unchanged
+
+a1223 2
+\change_unchanged
+
+a1301 2
+
+\change_inserted 0 1273478114
+a1310 2
+\change_unchanged
+
+d1515 1
+a1515 11
+The free list
+\change_deleted 0 1273469807
+should
+\change_inserted 0 1273469810
+must
+\change_unchanged
+ be split
+\change_deleted 0 1273469815
+into multiple lists
+\change_unchanged
+to reduce contention.
+a1520 2
+\change_inserted 0 1273470006
+
+a1523 2
+
+\change_inserted 0 1273492055
+a1539 2
+
+\change_inserted 0 1273483888
+a1551 2
+\change_unchanged
+
+a1554 8
+
+\change_deleted 0 1272942055
+There are various ways to organize these lisys, but because we want to be
+ able to quickly identify which free list an entry is in, and reduce the
+ number of locks required for merging, we will use zoning (eg.
+ each free list covers some fixed fraction of the file).
+
+\change_inserted 0 1273484187
+d1556 1
+a1556 7
+
+\change_deleted 0 1273484194
+The algorithm for f
+\change_inserted 0 1273484194
+F
+\change_unchanged
+reeing is simple:
+d1560 1
+a1560 7
+Identify the correct
+\change_deleted 0 1273482856
+free list
+\change_inserted 0 1273482857
+zone
+\change_unchanged
+.
+d1564 1
+a1564 7
+Lock the
+\change_inserted 0 1273482895
+corresponding
+\change_unchanged
+list
+\change_inserted 0 1273482863
+.
+a1567 2
+
+\change_inserted 0 1273482909
+d1573 1
+a1573 13
+
+\change_deleted 0 1273482885
+, and p
+\change_inserted 0 1273482888
+P
+\change_unchanged
+lace the freed entry
+\change_deleted 0 1273492415
+at the head
+\change_inserted 0 1273492415
+in the list for that zone
+\change_unchanged
+.
+d1577 2
+a1578 7
+Allocation is a little more complicated, as we
+\change_deleted 0 1273483240
+merge entries as we walk the list:
+\change_inserted 0 1273484250
+perform delayed coalescing at this point:
+\change_unchanged
+
+d1582 1
+a1582 19
+Pick a
+\change_deleted 0 1273482955
+free list;
+\change_inserted 0 1273482957
+zone
+\change_unchanged
+ either the
+\change_deleted 0 1273482962
+list
+\change_inserted 0 1273482962
+zone
+\change_unchanged
+ we last freed
+\change_deleted 0 1273482966
+o
+\change_inserted 0 1273482966
+i
+\change_unchanged
+nto, or based on a
+d1594 1
+a1594 9
+Lock th
+\change_inserted 0 1273482980
+e corresponding
+\change_deleted 0 1273482973
+at
+\change_unchanged
+ list.
+\change_inserted 0 1273482982
+
+a1597 2
+
+\change_inserted 0 1273483084
+a1598 53
+\change_unchanged
+
+\end_layout
+
+\begin_layout Enumerate
+If the top entry is
+\change_deleted 0 1273492155
+well-sized,
+\change_inserted 0 1273492159
+-large enough,
+\change_unchanged
+remove it from the list and return it.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise,
+\change_inserted 0 1273492206
+coalesce entries in the list.
+\change_deleted 0 1273492200
+examine the entry to the right of it in the file.
+ If it is free:
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+
+\change_deleted 0 1273492200
+If that entry is in a different list, lock that list too.
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1273492200
+If we had to place a new lock, re-check that the entry is free.
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1273492200
+Remove that entry from its free list and expand this entry to cover it.
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1273485554
+Goto step 3.
+\end_layout
+
+\end_deeper
+\begin_layout Enumerate
+
+\change_inserted 0 1273485311
+If there was no entry large enough, unlock the list and try the next zone.
+d1602 1
+a1602 5
+
+\change_deleted 0 1273483646
+Repeat step 3 with each entry in the list.
+\change_unchanged
+
+d1606 2
+a1607 5
+
+\change_deleted 0 1273483668
+Unlock the list and repeat step 2 with the next list.
+\change_unchanged
+
+d1611 1
+a1611 7
+If no
+\change_deleted 0 1273483671
+list
+\change_inserted 0 1273483671
+zone
+\change_unchanged
+ satisfies, expand the file.
+d1615 2
+a1616 9
+This optimizes rapid insert/delete of free list entries
+\change_inserted 0 1273485794
+ by not coalescing them all the time.
+\change_deleted 0 1273483685
+, and allows us to get rid of the tailer altogether
+\change_unchanged
+.
+
+\change_inserted 0 1273492299
+a1638 39
+
+\change_deleted 0 1273476840
+The question of
+\begin_inset Quotes eld
+\end_inset
+
+well-sized
+\begin_inset Quotes erd
+\end_inset
+
+ free entries is more difficult: the 25% overhead works in practice for
+ ldb because indexes tend to expand by one record at a time.
+ This can be resolved by having an
+\begin_inset Quotes eld
+\end_inset
+
+expanded
+\begin_inset Quotes erd
+\end_inset
+
+ bit in the header to note entries that have previously expanded, and allocating
+ more space for them.
+ Whether the
+\begin_inset Quotes eld
+\end_inset
+
+increasing slack
+\begin_inset Quotes erd
+\end_inset
+
+ algorithm should be implemented or first-fit used is still unknown: we
+ will determine this once these other ideas are implemented.
+\change_inserted 0 1273483750
+
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 0 1273492450
+a1644 2
+
+\change_inserted 0 1273470441
+a1654 2
+
+\change_inserted 0 1273476556
+a1659 2
+
+\change_inserted 0 1273470423
+a1661 2
+\change_unchanged
+
+a1672 2
+
+\change_inserted 0 1273476847
+a1676 2
+
+\change_inserted 0 1273476886
+a1691 2
+
+\change_inserted 0 1273477233
+a1699 2
+
+\change_inserted 0 1273477534
+a1706 2
+
+\change_inserted 0 1273482700
+a1712 2
+
+\change_inserted 0 1273478079
+a1722 2
+
+\change_inserted 0 1273477839
+a1726 2
+
+\change_inserted 0 1273477925
+a1730 2
+
+\change_inserted 0 1273477925
+a1734 2
+
+\change_inserted 0 1273477925
+a1738 2
+
+\change_inserted 0 1273477925
+a1742 2
+
+\change_inserted 0 1273477925
+a1746 2
+
+\change_inserted 0 1273477925
+a1750 2
+
+\change_inserted 0 1273477925
+a1754 2
+
+\change_inserted 0 1273477925
+a1758 2
+
+\change_inserted 0 1273477925
+a1762 2
+
+\change_inserted 0 1273477925
+a1766 2
+
+\change_inserted 0 1273477925
+a1770 2
+
+\change_inserted 0 1273477925
+a1774 2
+
+\change_inserted 0 1273477925
+a1778 2
+
+\change_inserted 0 1273477925
+a1782 2
+
+\change_inserted 0 1273477925
+a1786 2
+
+\change_inserted 0 1273477925
+a1790 2
+
+\change_inserted 0 1273477925
+a1794 2
+
+\change_inserted 0 1273477925
+a1798 2
+
+\change_inserted 0 1273492522
+a1802 2
+
+\change_inserted 0 1273492530
+a1806 2
+
+\change_inserted 0 1273492546
+a1810 2
+
+\change_inserted 0 1273478239
+a1814 2
+
+\change_inserted 0 1273479960
+a1821 2
+
+\change_inserted 0 1273480265
+a1830 2
+
+\change_inserted 0 1273480354
+a1845 2
+
+\change_inserted 0 1273478968
+a1851 2
+
+\change_inserted 0 1273492604
+a1859 2
+
+\change_inserted 0 1273479572
+a1862 2
+\change_unchanged
+
+a1870 2
+
+\change_inserted 0 1273480282
+a1874 2
+
+\change_inserted 0 1273478931
+a1878 2
+
+\change_inserted 0 1273481549
+a1882 2
+
+\change_inserted 0 1273481557
+a1886 2
+
+\change_inserted 0 1273480307
+a1890 2
+
+\change_inserted 0 1273480335
+a1894 2
+
+\change_inserted 0 1273479897
+a1898 2
+
+\change_inserted 0 1273479653
+a1902 2
+
+\change_inserted 0 1273480371
+a1906 2
+
+\change_inserted 0 1273480464
+a1910 2
+
+\change_inserted 0 1273480399
+a1914 2
+
+\change_inserted 0 1273480425
+a1918 2
+
+\change_inserted 0 1273480453
+a1922 2
+
+\change_inserted 0 1273480455
+a1926 2
+
+\change_inserted 0 1273480450
+a1930 2
+
+\change_inserted 0 1273480452
+a1935 2
+\change_inserted 0 1273478830
+
+a1942 5
+
+\change_deleted 0 1273481604
+In theory, we could get away with 2: one after we write the new data, and
+ one to somehow atomically change over to it.
+\change_inserted 0 1273481632
+a1946 2
+
+\change_inserted 0 1273481724
+a1950 2
+
+\change_inserted 0 1273481713
+a1954 2
+
+\change_inserted 0 1273481717
+a1958 2
+
+\change_inserted 0 1273481730
+a1962 2
+
+\change_inserted 0 1273481736
+a1966 2
+
+\change_inserted 0 1273481744
+a1970 2
+
+\change_inserted 0 1273481748
+a1974 2
+
+\change_inserted 0 1273482185
+a1978 2
+
+\change_inserted 0 1273482259
+a1989 50
+
+\change_deleted 0 1273481848
+None.
+ Trying to rewrite the transaction code is a separate experiment, which
+ I encourage someone else to do.
+ At some point you say
+\begin_inset Quotes eld
+\end_inset
+
+use a real database
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1273481848
+But as a thought experiment:
+\change_unchanged
+
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1273481788
+Say there was a pointer in the header which said where the hash table and
+ free list tables were, and that no blocks were labeled with whether they
+ were free or not (it had to be derived from what list they were in).
+ We could create new hash table and free list in some free space, and populate
+ it as we want the post-committed state to look.
+ Then we sync, then we switch the offset in the header, then we sync again.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1273481788
+This would not allow arbitrary changes to the database, such as tdb_repack
+ does, and would require more space (since we have to preserve the current
+ and future entries at once).
+ If we used hash trees rather than one big hash table, we might only have
+ to rewrite some sections of the hash, too.
+\change_inserted 0 1273481854
+
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 0 1273482102
+a1993 2
+
+\change_inserted 0 1273482061
+a1998 2
+
+\change_inserted 0 1273482063
+a2002 2
+
+\change_inserted 0 1273482072
+a2006 2
+
+\change_inserted 0 1273482139
+a2011 2
+
+\change_inserted 0 1273482364
+a2015 2
+
+\change_inserted 0 1273482163
+a2019 2
+
+\change_inserted 0 1273482493
+a2037 2
+
+\change_inserted 0 1273482536
+a2046 2
+\change_unchanged
+
+a2049 2
+
+\change_inserted 0 1273482641
+a2058 2
+
+\change_inserted 0 1273481827
+d2067 2
+a2068 11
+We could
+\change_inserted 0 1273481829
+then
+\change_unchanged
+implement snapshots using a similar method
+\change_deleted 0 1273481838
+ to the above, only
+\change_inserted 0 1273481840
+,
+\change_unchanged
+ using multiple different hash tables/free tables.
+@
+
+
+1.2
+log
+@After first feedback (Ronnie & Volker)
+@
+text
+@d1314 13
+d1531 11
+a1541 1
+The free list should be split into multiple lists to reduce contention.
+d1547 39
+d1596 7
+d1604 1
+a1604 1
+The algorithm for freeing is simple:
+d1608 7
+a1614 1
+Identify the correct free list.
+d1618 30
+a1647 1
+Lock the list, and place the freed entry at the head.
+d1651 7
+a1657 2
+Allocation is a little more complicated, as we merge entries as we walk
+ the list:
+d1661 19
+a1679 1
+Pick a free list; either the list we last freed onto, or based on a
+d1691 17
+a1707 1
+Lock that list.
+d1711 7
+a1717 1
+If the top entry is well-sized, remove it from the list and return it.
+d1721 5
+a1725 1
+Otherwise, examine the entry to the right of it in the file.
+d1731 2
+d1737 2
+d1743 2
+d1749 2
+d1756 8
+d1765 2
+d1770 2
+d1773 2
+d1778 7
+a1784 1
+If no list satisfies, expand the file.
+d1788 28
+a1815 2
+This optimizes rapid insert/delete of free list entries, and allows us to
+ get rid of the tailer altogether.
+d1819 2
+d1851 1
+a1851 1
+\change_inserted 0 1272941474
+d1857 303
+a2159 18
+\change_inserted 0 1272942759
+There are various ways to organize these lists, but because we want to be
+ able to quickly identify which free list an entry is in, and reduce the
+ number of locks required for merging, we will use zoning (eg.
+ each of the N free lists in a tdb file of size M covers a fixed fraction
+ M/N).
+ Note that this means we need to reshuffle the free lists when we expand
+ the file; this is probably acceptable when we double the hash table size,
+ since that is such an expensive operation already.
+ In the case of increasing the file size, there is an optimization we can
+ use: if we use M in the formula above as the file size rounded up to the
+ next power of 2, we only need reshuffle free lists when the file size crosses
+ a power of 2 boundary,
+\emph on
+and
+\emph default
+reshuffling the free lists is trivial: we simply merge every consecutive
+ pair of free lists.
+d2164 107
+d2276 2
+d2280 59
+d2346 2
+d2363 2
+d2366 2
+d2371 2
+d2382 2
+d2389 57
+d2458 13
+d2474 32
+a2505 2
+We could implement snapshots using a similar method to the above, only using
+ multiple different hash tables/free tables.
+@
+
+
+1.1
+log
+@Initial revision
+@
+text
+@d1 1
+a1 1
+#LyX 1.6.4 created this file. For more info see http://www.lyx.org/
+d36 3
+a38 3
+\tracking_changes false
+\output_changes false
+\author ""
+d662 5
+a666 1
+ behavior of disallowing transactions should become the default.
+d1215 21
+d1527 2
+d1533 3
+a1535 1
+ The algorithm for freeing is simple:
+d1642 26
+@
diff --git a/lib/ntdb/doc/design.pdf b/lib/ntdb/doc/design.pdf
new file mode 100644
index 0000000000..558dc1f8c2
Binary files /dev/null and b/lib/ntdb/doc/design.pdf differ
diff --git a/lib/ntdb/doc/design.txt b/lib/ntdb/doc/design.txt
new file mode 100644
index 0000000000..bd2ffde4db
--- /dev/null
+++ b/lib/ntdb/doc/design.txt
@@ -0,0 +1,1258 @@
+TDB2: Redesigning The Trivial DataBase
+
+Rusty Russell, IBM Corporation
+
+1-December-2010
+
+Abstract
+
+The Trivial DataBase on-disk format is 32 bits; with usage cases
+heading towards the 4G limit, that must change. This required
+breakage provides an opportunity to revisit TDB's other design
+decisions and reassess them.
+
+1 Introduction
+
+The Trivial DataBase was originally written by Andrew Tridgell as
+a simple key/data pair storage system with the same API as dbm,
+but allowing multiple readers and writers while being small
+enough (< 1000 lines of C) to include in SAMBA. The simple design
+created in 1999 has proven surprisingly robust and performant,
+used in Samba versions 3 and 4 as well as numerous other
+projects. Its useful life was greatly increased by the
+(backwards-compatible!) addition of transaction support in 2005.
+
+The wider variety and greater demands of TDB-using code have led
+to some organic growth of the API, as well as some compromises on
+the implementation. None of these, by themselves, are seen as
+show-stoppers, but the cumulative effect is a loss of elegance
+over the initial, simple TDB implementation. Here is a table of
+the approximate number of lines of implementation code and number
+of API functions at the end of each year:
+
+
++-----------+----------------+--------------------------------+
+| Year End  | API Functions  | Lines of C Code Implementation |
++-----------+----------------+--------------------------------+
++-----------+----------------+--------------------------------+
+|   1999    |      13        |              1195              |
++-----------+----------------+--------------------------------+
+|   2000    |      24        |              1725              |
++-----------+----------------+--------------------------------+
+|   2001    |      32        |              2228              |
++-----------+----------------+--------------------------------+
+|   2002    |      35        |              2481              |
++-----------+----------------+--------------------------------+
+|   2003    |      35        |              2552              |
++-----------+----------------+--------------------------------+
+|   2004    |      40        |              2584              |
++-----------+----------------+--------------------------------+
+|   2005    |      38        |              2647              |
++-----------+----------------+--------------------------------+
+|   2006    |      52        |              3754              |
++-----------+----------------+--------------------------------+
+|   2007    |      66        |              4398              |
++-----------+----------------+--------------------------------+
+|   2008    |      71        |              4768              |
++-----------+----------------+--------------------------------+
+|   2009    |      73        |              5715              |
++-----------+----------------+--------------------------------+
+
+
+This review is an attempt to catalog and address all the known
+issues with TDB and create solutions which address the problems
+without significantly increasing complexity; all involved are far
+too aware of the dangers of second system syndrome in rewriting a
+successful project like this.
+
+2 API Issues
+
+2.1 tdb_open_ex Is Not Expandable
+
+The tdb_open() call was expanded to tdb_open_ex(), which added an
+optional hashing function and an optional logging function
+argument. Additional arguments to open would require the
+introduction of a tdb_open_ex2 call etc.
+
+2.1.1 Proposed Solution<attributes>
+
+tdb_open() will take a linked-list of attributes:
+
+enum tdb_attribute {
+
+    TDB_ATTRIBUTE_LOG = 0,
+
+    TDB_ATTRIBUTE_HASH = 1
+
+};
+
+struct tdb_attribute_base {
+
+    enum tdb_attribute attr;
+
+    union tdb_attribute *next;
+
+};
+
+struct tdb_attribute_log {
+
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG
+*/
+
+    tdb_log_func log_fn;
+
+    void *log_private;
+
+};
+
+struct tdb_attribute_hash {
+
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH
+*/
+
+    tdb_hash_func hash_fn;
+
+    void *hash_private;
+
+};
+
+union tdb_attribute {
+
+    struct tdb_attribute_base base;
+
+    struct tdb_attribute_log log;
+
+    struct tdb_attribute_hash hash;
+
+};
+
+This allows future attributes to be added, even if this expands
+the size of the union.
+
+2.1.2 Status
+
+Complete.
+
+2.2 tdb_traverse Makes Impossible Guarantees
+
+tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions,
+and it was thought that it was important to guarantee that all
+records which exist at the start and end of the traversal would
+be included, and no record would be included twice.
+
+This adds complexity (see [Reliable-Traversal-Adds]) and does not
+work anyway for records which are altered (in particular, those
+which are expanded may be effectively deleted and re-added behind
+the traversal).
+
+2.2.1 <traverse-Proposed-Solution>Proposed Solution
+
+Abandon the guarantee. You will see every record if no changes
+occur during your traversal, otherwise you will see some subset.
+You can prevent changes by using a transaction or the locking
+API.
+
+2.2.2 Status
+
+Complete. Delete-during-traverse will still delete every record,
+too (assuming no other changes).
+
+2.3 Nesting of Transactions Is Fraught
+
+TDB has alternated between allowing nested transactions and not
+allowing them. Various paths in the Samba codebase assume that
+transactions will nest, and in a sense they can: the operation is
+only committed to disk when the outer transaction is committed.
+There are two problems, however:
+
+1. Canceling the inner transaction will cause the outer
+  transaction commit to fail, and will not undo any operations
+  since the inner transaction began. This problem is soluble with
+  some additional internal code.
+
+2. An inner transaction commit can be cancelled by the outer
+  transaction. This is desirable in the way which Samba's
+  database initialization code uses transactions, but could be a
+  surprise to any users expecting a successful transaction commit
+  to expose changes to others.
+
+The current solution is to specify the behavior at tdb_open(),
+with the default currently that nested transactions are allowed.
+This flag can also be changed at runtime.
+
+2.3.1 Proposed Solution
+
+Given the usage patterns, it seems that the “least-surprise”
+behavior of disallowing nested transactions should become the
+default. Additionally, it seems the outer transaction is the only
+code which knows whether inner transactions should be allowed, so
+a flag to indicate this could be added to tdb_transaction_start.
+However, this behavior can be simulated with a wrapper which uses
+tdb_add_flags() and tdb_remove_flags(), so the API should not be
+expanded for this relatively-obscure case.
+
+2.3.2 Status
+
+Incomplete; nesting flag is still defined as per tdb1.
+
+2.4 Incorrect Hash Function is Not Detected
+
+tdb_open_ex() allows the calling code to specify a different hash
+function to use, but does not check that all other processes
+accessing this tdb are using the same hash function. The result
+is that records are missing from tdb_fetch().
+
+2.4.1 Proposed Solution
+
+The header should contain an example hash result (eg. the hash of
+0xdeadbeef), and tdb_open_ex() should check that the given hash
+function produces the same answer, or fail the tdb_open call.
+
+2.4.2 Status
+
+Complete.
+
+2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
+
+In response to scalability issues with the free list ([TDB-Freelist-Is]
+) two API workarounds have been incorporated in TDB:
+tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open. The
+latter actually calls the former with an argument of “5”.
+
+This code allows deleted records to accumulate without putting
+them in the free list. On delete we iterate through each chain
+and free them in a batch if there are more than max_dead entries.
+These are never otherwise recycled except as a side-effect of a
+tdb_repack.
+
+2.5.1 Proposed Solution
+
+With the scalability problems of the freelist solved, this API
+can be removed. The TDB_VOLATILE flag may still be useful as a
+hint that store and delete of records will be at least as common
+as fetch in order to allow some internal tuning, but initially
+will become a no-op.
+
+2.5.2 Status
+
+Incomplete. TDB_VOLATILE still defined, but implementation should
+fail on unknown flags to be future-proof.
+
+2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times
+  In The Same Process
+
+No process can open the same TDB twice; we check and disallow it.
+This is an unfortunate side-effect of fcntl locks, which operate
+on a per-file rather than per-file-descriptor basis, and do not
+nest. Thus, closing any file descriptor on a file clears all the
+locks obtained by this process, even if they were placed using a
+different file descriptor!
+
+Note that even if this were solved, deadlock could occur if
+operations were nested: this is a more manageable programming
+error in most cases.
+
+2.6.1 Proposed Solution
+
+We could lobby POSIX to fix the perverse rules, or at least lobby
+Linux to violate them so that the most common implementation does
+not have this restriction. This would be a generally good idea
+for other fcntl lock users.
+
+Samba uses a wrapper which hands out the same tdb_context to
+multiple callers if this happens, and does simple reference
+counting. We should do this inside the tdb library, which already
+emulates lock nesting internally; it would need to recognize when
+deadlock occurs within a single process. This would create a new
+failure mode for tdb operations (while we currently handle
+locking failures, they are impossible in normal use and a process
+encountering them can do little but give up).
+
+I do not see benefit in an additional tdb_open flag to indicate
+whether re-opening is allowed, although there may be some
+benefit to adding a call to detect when a tdb_context is shared,
+to allow others to create such an API.
+
+2.6.2 Status
+
+Incomplete.
+
+2.7 TDB API Is Not POSIX Thread-safe
+
+The TDB API uses an error code which can be queried after an
+operation to determine what went wrong. This programming model
+does not work with threads, unless specific additional guarantees
+are given by the implementation. In addition, even
+otherwise-independent threads cannot open the same TDB (as in [TDB-Files-Cannot]
+).
+
+2.7.1 Proposed Solution
+
+Rearchitecting the API to include a tdb_errcode pointer would be a
+great deal of churn; we are better to guarantee that the
+tdb_errcode is per-thread so the current programming model can be
+maintained.
+
+This requires dynamic per-thread allocations, which is awkward
+with POSIX threads (pthread_key_create space is limited and we
+cannot simply allocate a key for every TDB).
+
+Internal locking is required to make sure that fcntl locks do not
+overlap between threads, and also that the global list of tdbs is
+maintained.
+
+The aim is that building tdb with -DTDB_PTHREAD will result in a
+pthread-safe version of the library, and otherwise no overhead
+will exist. Alternatively, a hooking mechanism similar to that
+proposed for [Proposed-Solution-locking-hook] could be used to
+enable pthread locking at runtime.
+
+2.7.2 Status
+
+Incomplete.
+
+2.8 *_nonblock Functions And *_mark Functions Expose
+  Implementation
+
+CTDB[footnote:
+Clustered TDB, see http://ctdb.samba.org
+] wishes to operate on TDB in a non-blocking manner. This is
+currently done as follows:
+
+1. Call the _nonblock variant of an API function (eg.
+  tdb_lockall_nonblock). If this fails:
+
+2. Fork a child process, and wait for it to call the normal
+  variant (eg. tdb_lockall).
+
+3. If the child succeeds, call the _mark variant to indicate we
+  already have the locks (eg. tdb_lockall_mark).
+
+4. Upon completion, tell the child to release the locks (eg.
+  tdb_unlockall).
+
+5. Indicate to tdb that it should consider the locks removed (eg.
+  tdb_unlockall_mark).
+
+There are several issues with this approach. Firstly, adding two
+new variants of each function clutters the API for an obscure
+use, and so not all functions have three variants. Secondly, it
+assumes that all paths of the functions ask for the same locks,
+otherwise the parent process will have to get a lock which the
+child doesn't have under some circumstances. I don't believe this
+is currently the case, but it constrains the implementation.
+
+2.8.1 <Proposed-Solution-locking-hook>Proposed Solution
+
+Implement a hook for locking methods, so that the caller can
+control the calls to create and remove fcntl locks. In this
+scenario, ctdbd would operate as follows:
+
+1. Call the normal API function, eg tdb_lockall().
+
+2. When the lock callback comes in, check if the child has the
+  lock. Initially, this is always false. If so, return 0.
+  Otherwise, try to obtain it in non-blocking mode. If that
+  fails, return EWOULDBLOCK.
+
+3. Release locks in the unlock callback as normal.
+
+4. If tdb_lockall() fails, see if we recorded a lock failure; if
+  so, call the child to repeat the operation.
+
+5. The child records what locks it obtains, and returns that
+  information to the parent.
+
+6. When the child has succeeded, goto 1.
+
+This is flexible enough to handle any potential locking scenario,
+even when lock requirements change. It can be optimized so that
+the parent does not release locks, just tells the child which
+locks it doesn't need to obtain.
+
+It also keeps the complexity out of the API, and in ctdbd where
+it is needed.
+
+2.8.2 Status
+
+Incomplete.
+
+2.9 tdb_chainlock Functions Expose Implementation
+
+tdb_chainlock locks some number of records, including the record
+indicated by the given key. This gave atomicity guarantees;
+no-one can start a transaction, alter, read or delete that key
+while the lock is held.
+
+It also makes the same guarantee for any other key in the chain,
+which is an internal implementation detail and potentially a
+cause for deadlock.
+
+2.9.1 Proposed Solution
+
+None. It would be nice to have an explicit single entry lock
+which affected no other keys. Unfortunately, this won't work for
+an entry which doesn't exist. Thus while chainlock may be
+implemented more efficiently for the existing case, it will still
+have overlap issues with the non-existing case. So it is best to
+keep the current (lack of) guarantee about which records will be
+effected to avoid constraining our implementation.
+
+2.10 Signal Handling is Not Race-Free
+
+The tdb_setalarm_sigptr() call allows the caller's signal handler
+to indicate that the tdb locking code should return with a
+failure, rather than trying again when a signal is received (and
+errno == EAGAIN). This is usually used to implement timeouts.
+
+Unfortunately, this does not work in the case where the signal is
+received before the tdb code enters the fcntl() call to place the
+lock: the code will sleep within the fcntl() code, unaware that
+the signal wants it to exit. In the case of long timeouts, this
+does not happen in practice.
+
+2.10.1 Proposed Solution
+
+The locking hooks proposed in [Proposed-Solution-locking-hook]
+would allow the user to decide on whether to fail the lock
+acquisition on a signal. This allows the caller to choose their
+own compromise: they could narrow the race by checking
+immediately before the fcntl call.[footnote:
+It may be possible to make this race-free in some implementations
+by having the signal handler alter the struct flock to make it
+invalid. This will cause the fcntl() lock call to fail with
+EINVAL if the signal occurs before the kernel is entered,
+otherwise EAGAIN.
+]
+
+2.10.2 Status
+
+Incomplete.
+
+2.11 The API Uses Gratuitous Typedefs, Capitals
+
+typedefs are useful for providing source compatibility when types
+can differ across implementations, or arguably in the case of
+function pointer definitions which are hard for humans to parse.
+Otherwise it is simply obfuscation and pollutes the namespace.
+
+Capitalization is usually reserved for compile-time constants and
+macros.
+
+  TDB_CONTEXT There is no reason to use this over 'struct
+  tdb_context'; the definition isn't visible to the API user
+  anyway.
+
+  TDB_DATA There is no reason to use this over struct TDB_DATA;
+  the struct needs to be understood by the API user.
+
+  struct TDB_DATA This would normally be called 'struct
+  tdb_data'.
+
+  enum TDB_ERROR Similarly, this would normally be enum
+  tdb_error.
+
+2.11.1 Proposed Solution
+
+None. Introducing lower case variants would please pedants like
+myself, but if it were done the existing ones should be kept.
+There is little point forcing a purely cosmetic change upon tdb
+users.
+
+2.12 <tdb_log_func-Doesnt-Take>tdb_log_func Doesn't Take The
+  Private Pointer
+
+For API compatibility reasons, the logging function needs to call
+tdb_get_logging_private() to retrieve the pointer registered by
+the tdb_open_ex for logging.
+
+2.12.1 Proposed Solution
+
+It should simply take an extra argument, since we are prepared to
+break the API/ABI.
+
+2.12.2 Status
+
+Complete.
+
+2.13 Various Callback Functions Are Not Typesafe
+
+The callback functions in tdb_set_logging_function (after
+[tdb_log_func-Doesnt-Take] is resolved), tdb_parse_record,
+tdb_traverse, tdb_traverse_read and tdb_check all take void * and
+must internally convert it to the argument type they were expecting.
+
+If this type changes, the compiler will not produce warnings on
+the callers, since it only sees void *.
+
+2.13.1 Proposed Solution
+
+With careful use of macros, we can create callback functions
+which give a warning when used on gcc and the types of the
+callback and its private argument differ. Unsupported compilers
+will not give a warning, which is no worse than now. In addition,
+the callbacks become clearer, as they need not use void * for
+their parameter.
+
+See CCAN's typesafe_cb module at
+http://ccan.ozlabs.org/info/typesafe_cb.html
+
+2.13.2 Status
+
+Incomplete.
+
+2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens,
+  tdb_reopen_all Problematic
+
+The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB
+file should be cleared if the caller discovers it is the only
+process with the TDB open. However, if any caller does not
+specify TDB_CLEAR_IF_FIRST it will not be detected, so will have
+the TDB erased underneath them (usually resulting in a crash).
+
+There is a similar issue on fork(); if the parent exits (or
+otherwise closes the tdb) before the child calls tdb_reopen_all()
+to establish the lock used to indicate the TDB is opened by
+someone, a TDB_CLEAR_IF_FIRST opener at that moment will believe
+it alone has opened the TDB and will erase it.
+
+2.14.1 Proposed Solution
+
+Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but
+see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
+
+2.14.2 Status
+
+Incomplete, TDB_CLEAR_IF_FIRST still defined, but not
+implemented.
+
+2.15 Extending The Header Is Difficult
+
+We have reserved (zeroed) words in the TDB header, which can be
+used for future features. If the future features are compulsory,
+the version number must be updated to prevent old code from
+accessing the database. But if the future feature is optional, we
+have no way of telling if older code is accessing the database or
+not.
+
+2.15.1 Proposed Solution
+
+The header should contain a “format variant” value (64-bit). This
+is divided into two 32-bit parts:
+
+1. The lower part reflects the format variant understood by code
+  accessing the database.
+
+2. The upper part reflects the format variant you must understand
+  to write to the database (otherwise you can only open for
+  reading).
+
+The latter field can only be written at creation time, the former
+should be written under the OPEN_LOCK when opening the database
+for writing, if the variant of the code is lower than the current
+lowest variant.
+
+This should allow backwards-compatible features to be added, and
+detection if older code (which doesn't understand the feature)
+writes to the database.
+
+2.15.2 Status
+
+Incomplete.
+
+2.16 Record Headers Are Not Expandable
+
+If we later want to add (say) checksums on keys and data, it
+would require another format change, which we'd like to avoid.
+
+2.16.1 Proposed Solution
+
+We often have extra padding at the tail of a record. If we ensure
+that the first byte (if any) of this padding is zero, we will
+have a way for future changes to detect code which doesn't
+understand a new format: the new code would write (say) a 1 at
+the tail, and thus if there is no tail or the first byte is 0, we
+would know the extension is not present on that record.
+
+2.16.2 Status
+
+Incomplete.
+
+2.17 TDB Does Not Use Talloc
+
+Many users of TDB (particularly Samba) use the talloc allocator,
+and thus have to wrap TDB in a talloc context to use it
+conveniently.
+
+2.17.1 Proposed Solution
+
+The allocation within TDB is not complicated enough to justify
+the use of talloc, and I am reluctant to force another
+(excellent) library on TDB users. Nonetheless a compromise is
+possible. An attribute (see [attributes]) can be added later to
+tdb_open() to provide an alternate allocation mechanism,
+specifically for talloc but usable by any other allocator (which
+would ignore the “context” argument).
+
+This would form a talloc hierarchy as expected, but the caller
+would still have to attach a destructor to the tdb context
+returned from tdb_open to close it. All TDB_DATA fields would be
+children of the tdb_context, and the caller would still have to
+manage them (using talloc_free() or talloc_steal()).
+
+2.17.2 Status
+
+Deferred.
+
+3 Performance And Scalability Issues
+
+3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST
+  Imposes Performance Penalty
+
+When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is
+placed at offset 4 (aka. the ACTIVE_LOCK). While these locks
+never conflict in normal tdb usage, they do add substantial
+overhead for most fcntl lock implementations when the kernel
+scans to detect if a lock conflict exists. This is often a single
+linked list, making the time to acquire and release a fcntl lock
+O(N) where N is the number of processes with the TDB open, not
+the number actually doing work.
+
+In a Samba server it is common to have huge numbers of clients
+sitting idle, and thus they have weaned themselves off the
+TDB_CLEAR_IF_FIRST flag.[footnote:
+There is a flag to tdb_reopen_all() which is used for this
+optimization: if the parent process will outlive the child, the
+child does not need the ACTIVE_LOCK. This is a workaround for
+this very performance issue.
+]
+
+3.1.1 Proposed Solution
+
+Remove the flag. It was a neat idea, but even trivial servers
+tend to know when they are initializing for the first time and
+can simply unlink the old tdb at that point.
+
+3.1.2 Status
+
+Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing.
+
+3.2 TDB Files Have a 4G Limit
+
+This seems to be becoming an issue (so much for “trivial”!),
+particularly for ldb.
+
+3.2.1 Proposed Solution
+
+A new, incompatible TDB format which uses 64 bit offsets
+internally rather than 32 bit as now. For simplicity of endian
+conversion (which TDB does on the fly if required), all values
+will be 64 bit on disk. In practice, some upper bits may be used
+for other purposes, but at least 56 bits will be available for
+file offsets.
+
+tdb_open() will automatically detect the old version, and even
+create them if TDB_VERSION6 is specified to tdb_open.
+
+32 bit processes will still be able to access TDBs larger than 4G
+(assuming that their off_t allows them to seek to 64 bits), they
+will gracefully fall back as they fail to mmap. This can happen
+already with large TDBs.
+
+Old versions of tdb will fail to open the new TDB files (since 28
+August 2009, commit 398d0c29290: prior to that any unrecognized
+file format would be erased and initialized as a fresh tdb!)
+
+3.2.2 Status
+
+Complete.
+
+3.3 TDB Records Have a 4G Limit
+
+This has not been a reported problem, and the API uses size_t
+which can be 64 bit on 64 bit platforms. However, other limits
+may have made such an issue moot.
+
+3.3.1 Proposed Solution
+
+Record sizes will be 64 bit, with an error returned on 32 bit
+platforms which try to access such records (the current
+implementation would return TDB_ERR_OOM in a similar case). It
+seems unlikely that 32 bit keys will be a limitation, so the
+implementation may not support this (see [sub:Records-Incur-A]).
+
+3.3.2 Status
+
+Complete.
+
+3.4 Hash Size Is Determined At TDB Creation Time
+
+TDB contains a number of hash chains in the header; the number is
+specified at creation time, and defaults to 131. This is such a
+bottleneck on large databases (as each hash chain gets quite
+long), that LDB uses 10,000 for this hash. In general it is
+impossible to know what the 'right' answer is at database
+creation time.
+
+3.4.1 <sub:Hash-Size-Solution>Proposed Solution
+
+After comprehensive performance testing on various scalable hash
+variants[footnote:
+http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94
+This was annoying because I was previously convinced that an
+expanding tree of hashes would be very close to optimal.
+], it became clear that it is hard to beat a straight linear hash
+table which doubles in size when it reaches saturation.
+Unfortunately, altering the hash table introduces serious locking
+complications: the entire hash table needs to be locked to
+enlarge the hash table, and others might be holding locks.
+Particularly insidious are insertions done under tdb_chainlock.
+
+Thus an expanding layered hash will be used: an array of hash
+groups, with each hash group exploding into pointers to lower
+hash groups once it fills, turning into a hash tree. This has
+implications for locking: we must lock the entire group in case
+we need to expand it, yet we don't know how deep the tree is at
+that point.
+
+Note that bits from the hash table entries should be stolen to
+hold more hash bits to reduce the penalty of collisions. We can
+use the otherwise-unused lower 3 bits. If we limit the size of
+the database to 64 petabytes, we can use the top 8 bits of the
+hash entry as well. These 11 bits would reduce false positives
+down to 1 in 2000 which is more than we need: we can use one of
+the bits to indicate that the extra hash bits are valid. This
+means we can choose not to re-hash all entries when we expand a
+hash group; simply use the next bits we need and mark them
+invalid.
+
+3.4.2 Status
+
+Complete.
+
+3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
+
+TDB uses a single linked list for the free list. Allocation
+occurs as follows, using heuristics which have evolved over time:
+
+1. Get the free list lock for this whole operation.
+
+2. Multiply length by 1.25, so we always over-allocate by 25%.
+
+3. Set the slack multiplier to 1.
+
+4. Examine the current freelist entry: if it is > length but <
+  the current best case, remember it as the best case.
+
+5. Multiply the slack multiplier by 1.05.
+
+6. If our best fit so far is less than length * slack multiplier,
+  return it. The slack will be turned into a new free record if
+  it's large enough.
+
+7. Otherwise, go onto the next freelist entry.
+
+Deleting a record occurs as follows:
+
+1. Lock the hash chain for this whole operation.
+
+2. Walk the chain to find the record, keeping the prev pointer
+  offset.
+
+3. If max_dead is non-zero:
+
+  (a) Walk the hash chain again and count the dead records.
+
+  (b) If it's more than max_dead, bulk free all the dead ones
+    (similar to steps 4 and below, but the lock is only obtained
+    once).
+
+  (c) Simply mark this record as dead and return.
+
+4. Get the free list lock for the remainder of this operation.
+
+5. <right-merging>Examine the following block to see if it is
+  free; if so, enlarge the current block and remove that block
+  from the free list. This was disabled, as removal from the free
+  list was O(entries-in-free-list).
+
+6. Examine the preceding block to see if it is free: for this
+  reason, each block has a 32-bit tailer which indicates its
+  length. If it is free, expand it to cover our new block and
+  return.
+
+7. Otherwise, prepend ourselves to the free list.
+
+Disabling right-merging (step [right-merging]) causes
+fragmentation; the other heuristics proved insufficient to
+address this, so the final answer to this was that when we expand
+the TDB file inside a transaction commit, we repack the entire
+tdb.
+
+The single list lock limits our allocation rate; due to the other
+issues this is not currently seen as a bottleneck.
+
+3.5.1 Proposed Solution
+
+The first step is to remove all the current heuristics, as they
+obviously interact, then examine them once the lock contention is
+addressed.
+
+The free list must be split to reduce contention. Assuming
+perfect free merging, we can at most have 1 free list entry for
+each entry. This implies that the number of free lists is related
+to the size of the hash table, but as it is rare to walk a large
+number of free list entries we can use far fewer, say 1/32 of the
+number of hash buckets.
+
+It seems tempting to try to reuse the hash implementation which
+we use for records here, but we have two ways of searching for
+free entries: for allocation we search by size (and possibly
+zone) which produces too many clashes for our hash table to
+handle well, and for coalescing we search by address. Thus an
+array of doubly-linked free lists seems preferable.
+
+There are various benefits in using per-size free lists (see
+[sub:TDB-Becomes-Fragmented]) but it's not clear this would reduce
+contention in the common case where all processes are
+allocating/freeing the same size. Thus we almost certainly need to
+divide in other ways: the most obvious is to divide the file into
+zones, and use a free list (or table of free lists) for each. This
+approximates address ordering.
+
+Unfortunately it is difficult to know what heuristics should be
+used to determine zone sizes, and our transaction code relies on
+being able to create a “recovery area” by simply appending to the
+file (difficult if it would need to create a new zone header).
+Thus we use a linked-list of free tables; currently we only ever
+create one, but if there is more than one we choose one at random
+to use. In future we may use heuristics to add new free tables on
+contention. We only expand the file when all free tables are
+exhausted.
+
+The basic algorithm is as follows. Freeing is simple:
+
+1. Identify the correct free list.
+
+2. Lock the corresponding list.
+
+3. Re-check the list (we didn't have a lock, sizes could have
+  changed): relock if necessary.
+
+4. Place the freed entry in the list.
+
+Allocation is a little more complicated, as we perform delayed
+coalescing at this point:
+
+1. Pick a free table; usually the previous one.
+
+2. Lock the corresponding list.
+
+3. If the top entry is large enough, remove it from the list and
+  return it.
+
+4. Otherwise, coalesce entries in the list. If there was no entry
+  large enough, unlock the list and try the next largest list.
+
+5. If no list has an entry which meets our needs, try the next
+  free table.
+
+6. If no zone satisfies, expand the file.
+
+This optimizes rapid insert/delete of free list entries by not
+coalescing them all the time. First-fit address ordering
+seems to be fairly good for keeping fragmentation low
+(see [sub:TDB-Becomes-Fragmented]). Note that address ordering
+does not need a tailer to coalesce, though if we needed one we
+could have one cheaply: see [sub:Records-Incur-A].
+
+Each free entry has the free table number in the header: less
+than 255. It also contains a doubly-linked list for easy
+deletion.
+
+3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented
+
+Much of this is a result of allocation strategy[footnote:
+The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995
+ftp://ftp.cs.utexas.edu/pub/garbage/malloc/ismm98.ps
+] and deliberate hobbling of coalescing; internal fragmentation
+(aka overallocation) is deliberately set at 25%, and external
+fragmentation is only cured by the decision to repack the entire
+db when a transaction commit needs to enlarge the file.
+
+3.6.1 Proposed Solution
+
+The 25% overhead on allocation works in practice for ldb because
+indexes tend to expand by one record at a time. This internal
+fragmentation can be resolved by having an “expanded” bit in the
+header to note entries that have previously expanded, and
+allocating more space for them.
+
+There is a spectrum of possible solutions for external
+fragmentation: one is to use a fragmentation-avoiding allocation
+strategy such as best-fit address-order allocator. The other end
+of the spectrum would be to use a bump allocator (very fast and
+simple) and simply repack the file when we reach the end.
+
+There are three problems with efficient fragmentation-avoiding
+allocators: they are non-trivial, they tend to use a single free
+list for each size, and there's no evidence that tdb allocation
+patterns will match those recorded for general allocators (though
+it seems likely).
+
+Thus we don't spend too much effort on external fragmentation; we
+will be no worse than the current code if we need to repack on
+occasion. More effort is spent on reducing freelist contention,
+and reducing overhead.
+
+3.7 <sub:Records-Incur-A>Records Incur A 28-Byte Overhead
+
+Each TDB record has a header as follows:
+
+struct tdb_record {
+
+        tdb_off_t next; /* offset of the next record in the list
+*/
+
+        tdb_len_t rec_len; /* total byte length of record */
+
+        tdb_len_t key_len; /* byte length of key */
+
+        tdb_len_t data_len; /* byte length of data */
+
+        uint32_t full_hash; /* the full 32 bit hash of the key */
+
+        uint32_t magic;   /* try to catch errors */
+
+        /* the following union is implied:
+
+                union {
+
+                        char record[rec_len];
+
+                        struct {
+
+                                char key[key_len];
+
+                                char data[data_len];
+
+                        }
+
+                        uint32_t totalsize; (tailer)
+
+                }
+
+        */
+
+};
+
+Naively, this would double to a 56-byte overhead on a 64 bit
+implementation.
+
+3.7.1 Proposed Solution
+
+We can use various techniques to reduce this for an allocated
+block:
+
+1. The 'next' pointer is not required, as we are using a flat
+  hash table.
+
+2. 'rec_len' can instead be expressed as an addition to key_len
+  and data_len (it accounts for wasted or overallocated length in
+  the record). Since the record length is always a multiple of 8,
+  we can conveniently fit it in 32 bits (representing up to 35
+  bits).
+
+3. 'key_len' and 'data_len' can be reduced. I'm unwilling to
+  restrict 'data_len' to 32 bits, but instead we can combine the
+  two into one 64-bit field and using a 5 bit value which
+  indicates at what bit to divide the two. Keys are unlikely to
+  scale as fast as data, so I'm assuming a maximum key size of 32
+  bits.
+
+4. 'full_hash' is used to avoid a memcmp on the “miss” case, but
+  this is diminishing returns after a handful of bits (at 10
+  bits, it reduces 99.9% of false memcmp). As an aside, as the
+  lower bits are already incorporated in the hash table
+  resolution, the upper bits should be used here. Note that it's
+  not clear that these bits will be a win, given the extra bits
+  in the hash table itself (see [sub:Hash-Size-Solution]).
+
+5. 'magic' does not need to be enlarged: it currently reflects
+  one of 5 values (used, free, dead, recovery, and
+  unused_recovery). It is useful for quick sanity checking
+  however, and should not be eliminated.
+
+6. 'tailer' is only used to coalesce free blocks (so a block to
+  the right can find the header to check if this block is free).
+  This can be replaced by a single 'free' bit in the header of
+  the following block (and the tailer only exists in free
+  blocks).[footnote:
+This technique from Thomas Standish. Data Structure Techniques.
+Addison-Wesley, Reading, Massachusetts, 1980.
+] The current proposed coalescing algorithm doesn't need this,
+  however.
+
+This produces a 16 byte used header like this:
+
+struct tdb_used_record {
+
+        uint32_t used_magic : 16,
+
+
+
+                 key_data_divide: 5,
+
+                 top_hash: 11;
+
+        uint32_t extra_octets;
+
+        uint64_t key_and_data_len;
+
+};
+
+And a free record like this:
+
+struct tdb_free_record {
+
+        uint64_t free_magic: 8,
+
+                   prev : 56;
+
+
+
+        uint64_t free_table: 8,
+
+                 total_length : 56;
+
+        uint64_t next;
+
+};
+
+Note that by limiting valid offsets to 56 bits, we can pack
+everything we need into 3 64-bit words, meaning our minimum
+record size is 8 bytes.
+
+3.7.2 Status
+
+Complete.
+
+3.8 Transaction Commit Requires 4 fdatasync
+
+The current transaction algorithm is:
+
+1. write_recovery_data();
+
+2. sync();
+
+3. write_recovery_header();
+
+4. sync();
+
+5. overwrite_with_new_data();
+
+6. sync();
+
+7. remove_recovery_header();
+
+8. sync();
+
+On current ext3, each sync flushes all data to disk, so the next
+3 syncs are relatively expensive. But this could become a
+performance bottleneck on other filesystems such as ext4.
+
+3.8.1 Proposed Solution
+
+Neil Brown points out that this is overzealous, and only one sync
+is needed:
+
+1. Bundle the recovery data, a transaction counter and a strong
+  checksum of the new data.
+
+2. Strong checksum that whole bundle.
+
+3. Store the bundle in the database.
+
+4. Overwrite the oldest of the two recovery pointers in the
+  header (identified using the transaction counter) with the
+  offset of this bundle.
+
+5. sync.
+
+6. Write the new data to the file.
+
+Checking for recovery means identifying the latest bundle with a
+valid checksum and using the new data checksum to ensure that it
+has been applied. This is more expensive than the current check,
+but need only be done at open. For running databases, a separate
+header field can be used to indicate a transaction in progress;
+we need only check for recovery if this is set.
+
+3.8.2 Status
+
+Deferred.
+
+3.9 <sub:TDB-Does-Not>TDB Does Not Have Snapshot Support
+
+3.9.1 Proposed Solution
+None. At some point you say “use a real database” (but see [replay-attribute]).
+
+But as a thought experiment, if we implemented transactions to
+only overwrite free entries (this is tricky: there must not be a
+header in each entry which indicates whether it is free, but use
+of presence in metadata elsewhere), and a pointer to the hash
+table, we could create an entirely new commit without destroying
+existing data. Then it would be easy to implement snapshots in a
+similar way.
+
+This would not allow arbitrary changes to the database, such as
+tdb_repack does, and would require more space (since we have to
+preserve the current and future entries at once). If we used hash
+trees rather than one big hash table, we might only have to
+rewrite some sections of the hash, too.
+
+We could then implement snapshots using a similar method, using
+multiple different hash tables/free tables.
+
+3.9.2 Status
+
+Deferred.
+
+3.10 Transactions Cannot Operate in Parallel
+
+This would be useless for ldb, as it hits the index records with
+just about every update. It would add significant complexity in
+resolving clashes, and cause all the transaction callers to write
+their code to loop in the case where the transactions spuriously
+failed.
+
+3.10.1 Proposed Solution
+
+None (but see [replay-attribute]). We could solve a small part of
+the problem by providing read-only transactions. These would
+allow one write transaction to begin, but it could not commit
+until all r/o transactions are done. This would require a new
+RO_TRANSACTION_LOCK, which would be upgraded on commit.
+
+3.10.2 Status
+
+Deferred.
+
+3.11 Default Hash Function Is Suboptimal
+
+The Knuth-inspired multiplicative hash used by tdb is fairly slow
+(especially if we expand it to 64 bits), and works best when the
+hash bucket size is a prime number (which also means a slow
+modulus). In addition, it is highly predictable which could
+potentially lead to a Denial of Service attack in some TDB uses.
+
+3.11.1 Proposed Solution
+
+The Jenkins lookup3 hash[footnote:
+http://burtleburtle.net/bob/c/lookup3.c
+] is a fast and superbly-mixing hash. It's used by the Linux
+kernel and almost everything else. This has the particular
+properties that it takes an initial seed, and produces two 32 bit
+hash numbers, which we can combine into a 64-bit hash.
+
+The seed should be created at tdb-creation time from some random
+source, and placed in the header. This is far from foolproof, but
+adds a little bit of protection against hash bombing.
+
+3.11.2 Status
+
+Complete.
+
+3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity
+
+We lock a record during traversal iteration, and try to grab that
+lock in the delete code. If that grab on delete fails, we simply
+mark it deleted and continue onwards; traversal checks for this
+condition and does the delete when it moves off the record.
+
+If traversal terminates, the dead record may be left
+indefinitely.
+
+3.12.1 Proposed Solution
+
+Remove reliability guarantees; see [traverse-Proposed-Solution].
+
+3.12.2 Status
+
+Complete.
+
+3.13 Fcntl Locking Adds Overhead
+
+Placing a fcntl lock means a system call, as does removing one.
+This is actually one reason why transactions can be faster
+(everything is locked once at transaction start). In the
+uncontended case, this overhead can theoretically be eliminated.
+
+3.13.1 Proposed Solution
+
+None.
+
+We tried this before with spinlock support, in the early days of
+TDB, and it didn't make much difference except in manufactured
+benchmarks.
+
+We could use spinlocks (with futex kernel support under Linux),
+but it means that we lose automatic cleanup when a process dies
+with a lock. There is a method of auto-cleanup under Linux, but
+it's not supported by other operating systems. We could
+reintroduce a clear-if-first-style lock and sweep for dead
+futexes on open, but that wouldn't help the normal case of one
+concurrent opener dying. Increasingly elaborate repair schemes
+could be considered, but they require an ABI change (everyone
+must use them) anyway, so there's no need to do this at the same
+time as everything else.
+
+3.14 Some Transactions Don't Require Durability
+
+Volker points out that gencache uses a CLEAR_IF_FIRST tdb for
+normal (fast) usage, and occasionally empties the results into a
+transactional TDB. This kind of usage prioritizes performance
+over durability: as long as we are consistent, data can be lost.
+
+This would be more neatly implemented inside tdb: a “soft”
+transaction commit (ie. syncless) which meant that data may be
+reverted on a crash.
+
+3.14.1 Proposed Solution
+
+None.
+
+Unfortunately any transaction scheme which overwrites old data
+requires a sync before that overwrite to avoid the possibility of
+corruption.
+
+It seems possible to use a scheme similar to that described in
+[sub:TDB-Does-Not], where transactions are committed without
+overwriting existing data, and an array of top-level pointers were
+available in the header. If the transaction is “soft” then we would
+not need a sync at all: existing processes would pick up the new
+hash table and free list and work with that.
+
+At some later point, a sync would allow recovery of the old data
+into the free lists (perhaps when the array of top-level pointers
+filled). On crash, tdb_open() would examine the array of top
+levels, and apply the transactions until it encountered an
+invalid checksum.
+
+3.15 Tracing Is Fragile, Replay Is External
+
+The current TDB has compile-time-enabled tracing code, but it
+often breaks as it is not enabled by default. In a similar way,
+the ctdb code has an external wrapper which does replay tracing
+so it can coordinate cluster-wide transactions.
+
+3.15.1 Proposed Solution<replay-attribute>
+
+Tridge points out that an attribute can be later added to
+tdb_open (see [attributes]) to provide replay/trace hooks, which
+could become the basis for this and future parallel transactions
+and snapshot support.
+
+3.15.2 Status
+
+Deferred.
diff --git a/lib/ntdb/free.c b/lib/ntdb/free.c
new file mode 100644
index 0000000000..0fe6c73775
--- /dev/null
+++ b/lib/ntdb/free.c
@@ -0,0 +1,976 @@
+ /*
+   Trivial Database 2: free list/block handling
+   Copyright (C) Rusty Russell 2010
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <ccan/likely/likely.h>
+#include <ccan/ilog/ilog.h>
+#include <time.h>
+#include <assert.h>
+#include <limits.h>
+
+static unsigned fls64(uint64_t val)
+{
+	return ilog64(val);	/* Thin wrapper: ilog64() is from <ccan/ilog/ilog.h>. */
+}
+
+/* Map a record's data length (ignoring header) to the free bucket it belongs in. */
+unsigned int size_to_bucket(ntdb_len_t data_len)
+{
+	unsigned int bucket;
+
+	/* We can't have records smaller than this. */
+	assert(data_len >= NTDB_MIN_DATA_LEN);
+
+	/* Buckets are indexed by how far we exceed the minimum length. */
+	if (data_len - NTDB_MIN_DATA_LEN <= 64) {
+		/* Steps of 8: 0 in bucket 0, 8 in bucket 1... 64 in bucket 8. */
+		bucket = (data_len - NTDB_MIN_DATA_LEN) / 8;
+	} else {
+		/* After that we go power of 2, using fls64() of the excess. */
+		bucket = fls64(data_len - NTDB_MIN_DATA_LEN) + 2;
+	}
+
+	if (unlikely(bucket >= NTDB_FREE_BUCKETS))
+		bucket = NTDB_FREE_BUCKETS - 1;	/* Clamp to the last bucket. */
+	return bucket;
+}
+
+ntdb_off_t first_ftable(struct ntdb_context *ntdb)
+{	/* Reads the free_table field of the header; may yield an error offset. */
+	return ntdb_read_off(ntdb, offsetof(struct ntdb_header, free_table));
+}
+
+ntdb_off_t next_ftable(struct ntdb_context *ntdb, ntdb_off_t ftable)
+{	/* Reads the 'next' field of the free table at 'ftable'; 0 ends the chain. */
+	return ntdb_read_off(ntdb, ftable + offsetof(struct ntdb_freetable,next));
+}
+
+enum NTDB_ERROR ntdb_ftable_init(struct ntdb_context *ntdb)
+{
+	/* Use reservoir sampling algorithm to select a free list at random. */
+	unsigned int rnd, max = 0, count = 0;
+	ntdb_off_t off;
+
+	ntdb->ftable_off = off = first_ftable(ntdb);
+	ntdb->ftable = 0;
+
+	while (off) {
+		if (NTDB_OFF_IS_ERR(off)) {
+			return NTDB_OFF_TO_ERR(off);
+		}
+
+		rnd = random();
+		if (rnd >= max) {	/* Largest draw so far wins. */
+			ntdb->ftable_off = off;
+			ntdb->ftable = count;	/* Position of this table in the chain. */
+			max = rnd;
+		}
+
+		off = next_ftable(ntdb, off);
+		count++;
+	}
+	return NTDB_SUCCESS;
+}
+
+/* Offset of bucket number 'bucket' inside the free table found at ftable_off. */
+ntdb_off_t bucket_off(ntdb_off_t ftable_off, unsigned bucket)
+{
+	return ftable_off + offsetof(struct ntdb_freetable, buckets)
+		+ bucket * sizeof(ntdb_off_t);
+}
+
+/* Returns bucket number to search, >= NTDB_FREE_BUCKETS if none, or -ve error. */
+static ntdb_off_t find_free_head(struct ntdb_context *ntdb,
+				ntdb_off_t ftable_off,
+				ntdb_off_t bucket)
+{
+	/* Speculative (no lock is taken here): look for a non-zero bucket. */
+	return ntdb_find_nonzero_off(ntdb, bucket_off(ftable_off, 0),
+				    bucket, NTDB_FREE_BUCKETS);
+}
+
+static void check_list(struct ntdb_context *ntdb, ntdb_off_t b_off)
+{	/* Sanity-check the bucket at b_off; a no-op unless CCAN_NTDB_DEBUG is set. */
+#ifdef CCAN_NTDB_DEBUG
+	ntdb_off_t off, prev = 0, first;
+	struct ntdb_free_record r;
+
+	first = off = (ntdb_read_off(ntdb, b_off) & NTDB_OFF_MASK);
+	while (off != 0) {
+		ntdb_read_convert(ntdb, off, &r, sizeof(r));
+		if (frec_magic(&r) != NTDB_FREE_MAGIC)
+			abort();
+		if (prev && frec_prev(&r) != prev)	/* Back link must match. */
+			abort();
+		prev = off;
+		off = r.next;
+	}
+
+	if (first) {
+		ntdb_read_convert(ntdb, first, &r, sizeof(r));
+		if (frec_prev(&r) != prev)	/* Head's prev must be the tail. */
+			abort();
+	}
+#endif
+}
+
+/* Unlink free record r (which lives at r_off) from the bucket at b_off. */
+static enum NTDB_ERROR remove_from_list(struct ntdb_context *ntdb,
+				       ntdb_off_t b_off, ntdb_off_t r_off,
+				       const struct ntdb_free_record *r)
+{
+	ntdb_off_t off, prev_next, head;
+	enum NTDB_ERROR ecode;
+
+	/* Is this only element in list?  Zero out bucket, and we're done. */
+	if (frec_prev(r) == r_off)
+		return ntdb_write_off(ntdb, b_off, 0);
+
+	/* off = &r->prev->next */
+	off = frec_prev(r) + offsetof(struct ntdb_free_record, next);
+
+	/* Get prev->next */
+	prev_next = ntdb_read_off(ntdb, off);
+	if (NTDB_OFF_IS_ERR(prev_next))
+		return NTDB_OFF_TO_ERR(prev_next);
+
+	/* prev->next == 0 means prev is the tail, so we were head: point bucket at next. */
+	if (prev_next == 0) {
+		/* We must preserve upper bits. */
+		head = ntdb_read_off(ntdb, b_off);
+		if (NTDB_OFF_IS_ERR(head))
+			return NTDB_OFF_TO_ERR(head);
+
+		if ((head & NTDB_OFF_MASK) != r_off) {
+			return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+					  "remove_from_list:"
+					  " %llu head %llu on list %llu",
+					  (long long)r_off,
+					  (long long)head,
+					  (long long)b_off);
+		}
+		head = ((head & ~NTDB_OFF_MASK) | r->next);
+		ecode = ntdb_write_off(ntdb, b_off, head);
+		if (ecode != NTDB_SUCCESS)
+			return ecode;
+	} else {
+		/* r->prev->next = r->next */
+		ecode = ntdb_write_off(ntdb, off, r->next);
+		if (ecode != NTDB_SUCCESS)
+			return ecode;
+	}
+
+	/* If we were the tail, off = &head->prev (head's prev tracks the tail). */
+	if (r->next == 0) {
+		head = ntdb_read_off(ntdb, b_off);
+		if (NTDB_OFF_IS_ERR(head))
+			return NTDB_OFF_TO_ERR(head);
+		head &= NTDB_OFF_MASK;
+		off = head + offsetof(struct ntdb_free_record, magic_and_prev);
+	} else {
+		/* off = &r->next->prev */
+		off = r->next + offsetof(struct ntdb_free_record,
+					 magic_and_prev);
+	}
+
+#ifdef CCAN_NTDB_DEBUG
+	/* *off == r */
+	if ((ntdb_read_off(ntdb, off) & NTDB_OFF_MASK) != r_off) {
+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+				  "remove_from_list:"
+				  " %llu bad prev in list %llu",
+				  (long long)r_off, (long long)b_off);
+	}
+#endif
+	/* r->next->prev = r->prev (written with its magic bits intact) */
+	return ntdb_write_off(ntdb, off, r->magic_and_prev);
+}
+
+/* Enqueue record at off (data length len) at the head of the bucket at b_off.
+ * If *coalesce is set we bump the enqueue count; it stays set only on wrap. */
+static enum NTDB_ERROR enqueue_in_free(struct ntdb_context *ntdb,
+				      ntdb_off_t b_off,
+				      ntdb_off_t off,
+				      ntdb_len_t len,
+				      bool *coalesce)
+{
+	struct ntdb_free_record new;
+	enum NTDB_ERROR ecode;
+	ntdb_off_t prev, head;
+	uint64_t magic = (NTDB_FREE_MAGIC << (64 - NTDB_OFF_UPPER_STEAL));
+
+	head = ntdb_read_off(ntdb, b_off);
+	if (NTDB_OFF_IS_ERR(head))
+		return NTDB_OFF_TO_ERR(head);
+
+	/* ftable_and_len is final here; next and magic_and_prev are set below. */
+	new.ftable_and_len = ((uint64_t)ntdb->ftable
+			      << (64 - NTDB_OFF_UPPER_STEAL))
+		| len;
+
+	/* new->next = head. */
+	new.next = (head & NTDB_OFF_MASK);
+
+	/* First element?  Prev points to ourselves. */
+	if (!new.next) {
+		new.magic_and_prev = (magic | off);
+	} else {
+		/* new->prev = next->prev; a failed read is returned as-is. */
+		prev = ntdb_read_off(ntdb,
+				    new.next + offsetof(struct ntdb_free_record,
+							magic_and_prev));
+		if (NTDB_OFF_IS_ERR(prev))
+			return NTDB_OFF_TO_ERR(prev);
+		new.magic_and_prev = prev;
+		if (frec_magic(&new) != NTDB_FREE_MAGIC) {
+			return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+					  "enqueue_in_free: %llu bad head"
+					  " prev %llu",
+					  (long long)new.next,
+					  (long long)prev);
+		}
+		/* next->prev = new. */
+		ecode = ntdb_write_off(ntdb, new.next
+				      + offsetof(struct ntdb_free_record,
+						 magic_and_prev),
+				      off | magic);
+		if (ecode != NTDB_SUCCESS)
+			return ecode;
+
+#ifdef CCAN_NTDB_DEBUG
+		prev = ntdb_read_off(ntdb, frec_prev(&new)
+				    + offsetof(struct ntdb_free_record, next));
+		if (prev != 0) {
+			return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+					  "enqueue_in_free:"
+					  " %llu bad tail next ptr %llu",
+					  (long long)frec_prev(&new)
+					  + offsetof(struct ntdb_free_record,
+						     next),
+					  (long long)prev);
+		}
+#endif
+	}
+
+	/* Update enqueue count, but don't set high bit: see NTDB_OFF_IS_ERR */
+	if (*coalesce)
+		head += (1ULL << (64 - NTDB_OFF_UPPER_STEAL));
+	head &= ~(NTDB_OFF_MASK | (1ULL << 63));
+	head |= off;
+
+	ecode = ntdb_write_off(ntdb, b_off, head);
+	if (ecode != NTDB_SUCCESS)
+		return ecode;
+
+	/* It's time to coalesce if counter wrapped. */
+	if (*coalesce)
+		*coalesce = ((head & ~NTDB_OFF_MASK) == 0);
+
+	return ntdb_write_convert(ntdb, off, &new, sizeof(new));
+}
+
+static ntdb_off_t ftable_offset(struct ntdb_context *ntdb, unsigned int ftable)
+{	/* Returns offset of free table number 'ftable', or an error offset. */
+	ntdb_off_t off;
+	unsigned int i;
+
+	if (likely(ntdb->ftable == ftable))
+		return ntdb->ftable_off;	/* Fast path: the table we have cached. */
+
+	off = first_ftable(ntdb);
+	for (i = 0; i < ftable; i++) {
+		if (NTDB_OFF_IS_ERR(off)) {
+			break;
+		}
+		off = next_ftable(ntdb, off);
+	}
+	return off;
+}
+
+/* Returns new data length (+ve), 0 if nothing was merged, or -ve error.  We
+ * unlock b_off on error, or if we coalesced and had to blatt *protect. */
+static ntdb_len_t coalesce(struct ntdb_context *ntdb,
+			  ntdb_off_t off, ntdb_off_t b_off,
+			  ntdb_len_t data_len,
+			  ntdb_off_t *protect)
+{
+	ntdb_off_t end;
+	struct ntdb_free_record rec;
+	enum NTDB_ERROR ecode;
+
+	ntdb->stats.alloc_coalesce_tried++;
+	end = off + sizeof(struct ntdb_used_record) + data_len;
+
+	while (end < ntdb->file->map_size) {
+		const struct ntdb_free_record *r;
+		ntdb_off_t nb_off;
+		unsigned ftable, bucket;
+
+		r = ntdb_access_read(ntdb, end, sizeof(*r), true);
+		if (NTDB_PTR_IS_ERR(r)) {
+			ecode = NTDB_PTR_ERR(r);
+			goto err;
+		}
+
+		if (frec_magic(r) != NTDB_FREE_MAGIC
+		    || frec_ftable(r) == NTDB_FTABLE_NONE) {
+			ntdb_access_release(ntdb, r);
+			break;
+		}
+
+		ftable = frec_ftable(r);
+		bucket = size_to_bucket(frec_len(r));
+		nb_off = ftable_offset(ntdb, ftable);
+		if (NTDB_OFF_IS_ERR(nb_off)) {
+			ntdb_access_release(ntdb, r);
+			ecode = NTDB_OFF_TO_ERR(nb_off);
+			goto err;
+		}
+		nb_off = bucket_off(nb_off, bucket);
+		ntdb_access_release(ntdb, r);
+
+		/* We may be violating lock order here, so best effort. */
+		if (ntdb_lock_free_bucket(ntdb, nb_off, NTDB_LOCK_NOWAIT)
+		    != NTDB_SUCCESS) {
+			ntdb->stats.alloc_coalesce_lockfail++;
+			break;
+		}
+
+		/* Now we have lock, re-check. */
+		ecode = ntdb_read_convert(ntdb, end, &rec, sizeof(rec));
+		if (ecode != NTDB_SUCCESS) {
+			ntdb_unlock_free_bucket(ntdb, nb_off);
+			goto err;
+		}
+
+		if (unlikely(frec_magic(&rec) != NTDB_FREE_MAGIC)) {
+			ntdb->stats.alloc_coalesce_race++;
+			ntdb_unlock_free_bucket(ntdb, nb_off);
+			break;
+		}
+
+		if (unlikely(frec_ftable(&rec) != ftable)
+		    || unlikely(size_to_bucket(frec_len(&rec)) != bucket)) {
+			ntdb->stats.alloc_coalesce_race++;
+			ntdb_unlock_free_bucket(ntdb, nb_off);
+			break;
+		}
+
+		/* Did we just mess up a record you were hoping to use? */
+		if (end == *protect) {
+			ntdb->stats.alloc_coalesce_iterate_clash++;
+			*protect = NTDB_ERR_TO_OFF(NTDB_ERR_NOEXIST);
+		}
+
+		ecode = remove_from_list(ntdb, nb_off, end, &rec);
+		check_list(ntdb, nb_off);
+		if (ecode != NTDB_SUCCESS) {
+			ntdb_unlock_free_bucket(ntdb, nb_off);
+			goto err;
+		}
+
+		end += sizeof(struct ntdb_used_record) + frec_len(&rec);
+		ntdb_unlock_free_bucket(ntdb, nb_off);
+		ntdb->stats.alloc_coalesce_num_merged++;
+	}
+
+	/* Didn't find any adjacent free?  (Bucket b_off stays locked.) */
+	if (end == off + sizeof(struct ntdb_used_record) + data_len)
+		return 0;
+
+	/* Before we expand, check this isn't one you wanted protected? */
+	if (off == *protect) {
+		*protect = NTDB_ERR_TO_OFF(NTDB_ERR_EXISTS);
+		ntdb->stats.alloc_coalesce_iterate_clash++;
+	}
+
+	/* OK, expand initial record */
+	ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
+	if (ecode != NTDB_SUCCESS) {
+		goto err;
+	}
+
+	if (frec_len(&rec) != data_len) {
+		ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+				   "coalesce: expected data len %zu not %zu",
+				   (size_t)data_len, (size_t)frec_len(&rec));
+		goto err;
+	}
+
+	ecode = remove_from_list(ntdb, b_off, off, &rec);
+	check_list(ntdb, b_off);
+	if (ecode != NTDB_SUCCESS) {
+		goto err;
+	}
+
+	/* Try locking violation first.  We don't allow coalesce recursion! */
+	ecode = add_free_record(ntdb, off, end - off, NTDB_LOCK_NOWAIT, false);
+	if (ecode != NTDB_SUCCESS) {
+		/* Need to drop lock.  Can't rely on anything stable. */
+		ntdb->stats.alloc_coalesce_lockfail++;
+		*protect = NTDB_ERR_TO_OFF(NTDB_ERR_CORRUPT);
+
+		/* We have to drop this to avoid deadlocks, so make sure record
+		 * doesn't get coalesced by someone else! */
+		rec.ftable_and_len = (NTDB_FTABLE_NONE
+				      << (64 - NTDB_OFF_UPPER_STEAL))
+			| (end - off - sizeof(struct ntdb_used_record));
+		ecode = ntdb_write_off(ntdb,
+				      off + offsetof(struct ntdb_free_record,
+						     ftable_and_len),
+				      rec.ftable_and_len);
+		if (ecode != NTDB_SUCCESS) {
+			goto err;
+		}
+
+		ntdb_unlock_free_bucket(ntdb, b_off);
+
+		ecode = add_free_record(ntdb, off, end - off, NTDB_LOCK_WAIT,
+					false);
+		if (ecode != NTDB_SUCCESS) {
+			return NTDB_ERR_TO_OFF(ecode);	/* b_off already unlocked. */
+		}
+	} else if (NTDB_OFF_IS_ERR(*protect)) {
+		/* For simplicity, we always drop lock if they can't continue */
+		ntdb_unlock_free_bucket(ntdb, b_off);
+	}
+	ntdb->stats.alloc_coalesce_succeeded++;
+
+	/* Return usable length. */
+	return end - off - sizeof(struct ntdb_used_record);
+
+err:
+	/* To unify error paths, we *always* unlock bucket on error. */
+	ntdb_unlock_free_bucket(ntdb, b_off);
+	return NTDB_ERR_TO_OFF(ecode);
+}
+
+/* Caller holds the lock on bucket b_off: every return path leaves it unlocked. */
+static enum NTDB_ERROR coalesce_list(struct ntdb_context *ntdb,
+				    ntdb_off_t ftable_off,
+				    ntdb_off_t b_off,
+				    unsigned int limit)
+{
+	enum NTDB_ERROR ecode;
+	ntdb_off_t off;
+
+	off = ntdb_read_off(ntdb, b_off);
+	if (NTDB_OFF_IS_ERR(off)) {
+		ecode = NTDB_OFF_TO_ERR(off);
+		goto unlock_err;
+	}
+	/* A little bit of paranoia: counter should be 0. */
+	off &= NTDB_OFF_MASK;
+
+	while (off && limit--) {
+		struct ntdb_free_record rec;
+		ntdb_len_t coal;
+		ntdb_off_t next;
+
+		ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
+		if (ecode != NTDB_SUCCESS)
+			goto unlock_err;
+
+		next = rec.next;
+		coal = coalesce(ntdb, off, b_off, frec_len(&rec), &next);
+		if (NTDB_OFF_IS_ERR(coal)) {
+			/* This has already unlocked on error. */
+			return NTDB_OFF_TO_ERR(coal);
+		}
+		if (NTDB_OFF_IS_ERR(next)) {
+			/* Coalescing had to unlock, so stop. */
+			return NTDB_SUCCESS;
+		}
+		/* Keep going if we're doing well... */
+		limit += size_to_bucket(coal / 16 + NTDB_MIN_DATA_LEN);
+		off = next;
+	}
+
+	/* Now, move those elements to the tail of the list so we get something
+	 * else next time. */
+	if (off) {
+		struct ntdb_free_record oldhrec, newhrec, oldtrec, newtrec;
+		ntdb_off_t oldhoff, oldtoff, newtoff;
+
+		/* The record we were up to is the new head. */
+		ecode = ntdb_read_convert(ntdb, off, &newhrec, sizeof(newhrec));
+		if (ecode != NTDB_SUCCESS)
+			goto unlock_err;
+
+		/* Get the new tail: whatever currently precedes the new head. */
+		newtoff = frec_prev(&newhrec);
+		ecode = ntdb_read_convert(ntdb, newtoff, &newtrec,
+					 sizeof(newtrec));
+		if (ecode != NTDB_SUCCESS)
+			goto unlock_err;
+
+		/* Get the old head. */
+		oldhoff = ntdb_read_off(ntdb, b_off);
+		if (NTDB_OFF_IS_ERR(oldhoff)) {
+			ecode = NTDB_OFF_TO_ERR(oldhoff);
+			goto unlock_err;
+		}
+
+		/* This could happen if they all coalesced away. */
+		if (oldhoff == off)
+			goto out;
+
+		ecode = ntdb_read_convert(ntdb, oldhoff, &oldhrec,
+					 sizeof(oldhrec));
+		if (ecode != NTDB_SUCCESS)
+			goto unlock_err;
+
+		/* Get the old tail. */
+		oldtoff = frec_prev(&oldhrec);
+		ecode = ntdb_read_convert(ntdb, oldtoff, &oldtrec,
+					 sizeof(oldtrec));
+		if (ecode != NTDB_SUCCESS)
+			goto unlock_err;
+
+		/* Old tail's next points to old head. */
+		oldtrec.next = oldhoff;
+
+		/* Old head's prev points to old tail. */
+		oldhrec.magic_and_prev
+			= (NTDB_FREE_MAGIC << (64 - NTDB_OFF_UPPER_STEAL))
+			| oldtoff;
+
+		/* New tail's next is 0. */
+		newtrec.next = 0;
+
+		/* Write out the modified versions. */
+		ecode = ntdb_write_convert(ntdb, oldtoff, &oldtrec,
+					  sizeof(oldtrec));
+		if (ecode != NTDB_SUCCESS)
+			goto unlock_err;
+
+		ecode = ntdb_write_convert(ntdb, oldhoff, &oldhrec,
+					  sizeof(oldhrec));
+		if (ecode != NTDB_SUCCESS)
+			goto unlock_err;
+
+		ecode = ntdb_write_convert(ntdb, newtoff, &newtrec,
+					  sizeof(newtrec));
+		if (ecode != NTDB_SUCCESS)
+			goto unlock_err;
+
+		/* And finally link in new head. */
+		ecode = ntdb_write_off(ntdb, b_off, off);
+		if (ecode != NTDB_SUCCESS)
+			goto unlock_err;
+	}
+out:
+	ntdb_unlock_free_bucket(ntdb, b_off);
+	return NTDB_SUCCESS;
+
+unlock_err:
+	ntdb_unlock_free_bucket(ntdb, b_off);
+	return ecode;
+}
+
+/* Frees len_with_header bytes at off.  List must not be locked if coalesce_ok. */
+enum NTDB_ERROR add_free_record(struct ntdb_context *ntdb,
+			       ntdb_off_t off, ntdb_len_t len_with_header,
+			       enum ntdb_lock_flags waitflag,
+			       bool coalesce_ok)
+{
+	ntdb_off_t b_off;
+	ntdb_len_t len;
+	enum NTDB_ERROR ecode;
+
+	assert(len_with_header >= sizeof(struct ntdb_free_record));
+
+	len = len_with_header - sizeof(struct ntdb_used_record);
+
+	b_off = bucket_off(ntdb->ftable_off, size_to_bucket(len));
+	ecode = ntdb_lock_free_bucket(ntdb, b_off, waitflag);
+	if (ecode != NTDB_SUCCESS) {
+		return ecode;
+	}
+
+	ecode = enqueue_in_free(ntdb, b_off, off, len, &coalesce_ok);	/* May clear coalesce_ok. */
+	check_list(ntdb, b_off);
+
+	/* Coalescing unlocks free list. */
+	if (!ecode && coalesce_ok)
+		ecode = coalesce_list(ntdb, ntdb->ftable_off, b_off, 2);
+	else
+		ntdb_unlock_free_bucket(ntdb, b_off);
+	return ecode;
+}
+
+static size_t adjust_size(size_t keylen, size_t datalen)
+{
+	size_t size = keylen + datalen;
+
+	if (size < NTDB_MIN_DATA_LEN)
+		size = NTDB_MIN_DATA_LEN;	/* Never below the minimum data length. */
+
+	/* Round up to a multiple of sizeof(uint64_t). */
+	return (size + (sizeof(uint64_t) - 1ULL)) & ~(sizeof(uint64_t) - 1ULL);
+}
+
+/* Bytes of total_len we can split off as a new free record; 0 if too few. */
+static size_t record_leftover(size_t keylen, size_t datalen,
+			      bool want_extra, size_t total_len)
+{
+	ssize_t leftover;
+
+	if (want_extra)
+		datalen += datalen / 2;	/* Keep 50% slack for the data. */
+	leftover = total_len - adjust_size(keylen, datalen);
+
+	if (leftover < (ssize_t)sizeof(struct ntdb_free_record))
+		return 0;
+
+	return leftover;
+}
+
+/* Take a record able to hold keylen + datalen from one bucket of the free
+ * table at ftable_off.  Returns the record's offset (its header is already
+ * written), 0 if this bucket had nothing suitable, or -ve error. */
+static ntdb_off_t lock_and_alloc(struct ntdb_context *ntdb,
+				ntdb_off_t ftable_off,
+				ntdb_off_t bucket,
+				size_t keylen, size_t datalen,
+				bool want_extra,
+				unsigned magic,
+				unsigned hashlow)
+{
+	ntdb_off_t off, b_off,best_off;
+	struct ntdb_free_record best = { 0 };
+	double multiplier;
+	size_t size = adjust_size(keylen, datalen);
+	enum NTDB_ERROR ecode;
+
+	ntdb->stats.allocs++;
+	b_off = bucket_off(ftable_off, bucket);
+
+	/* FIXME: Try non-blocking wait first, to measure contention. */
+	/* Lock this bucket. */
+	ecode = ntdb_lock_free_bucket(ntdb, b_off, NTDB_LOCK_WAIT);
+	if (ecode != NTDB_SUCCESS) {
+		return NTDB_ERR_TO_OFF(ecode);
+	}
+
+	best.ftable_and_len = -1ULL;
+	best_off = 0;
+
+	/* Get slack if we're after extra. */
+	if (want_extra)
+		multiplier = 1.5;
+	else
+		multiplier = 1.0;
+
+	/* Walk the list to see if any are large enough, getting less fussy
+	 * as we go. */
+	off = ntdb_read_off(ntdb, b_off);
+	if (NTDB_OFF_IS_ERR(off)) {
+		ecode = NTDB_OFF_TO_ERR(off);
+		goto unlock_err;
+	}
+	off &= NTDB_OFF_MASK;
+
+	while (off) {
+		const struct ntdb_free_record *r;
+		ntdb_off_t next;
+
+		r = ntdb_access_read(ntdb, off, sizeof(*r), true);
+		if (NTDB_PTR_IS_ERR(r)) {
+			ecode = NTDB_PTR_ERR(r);
+			goto unlock_err;
+		}
+
+		if (frec_magic(r) != NTDB_FREE_MAGIC) {
+			ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+					   "lock_and_alloc:"
+					   " %llu non-free 0x%llx",
+					   (long long)off,
+					   (long long)r->magic_and_prev);
+			ntdb_access_release(ntdb, r);
+			goto unlock_err;
+		}
+
+		if (frec_len(r) >= size && frec_len(r) < frec_len(&best)) {
+			best_off = off;
+			best = *r;
+		}
+
+		if (frec_len(&best) <= size * multiplier && best_off) {
+			ntdb_access_release(ntdb, r);
+			break;
+		}
+
+		multiplier *= 1.01;
+
+		next = r->next;
+		ntdb_access_release(ntdb, r);
+		off = next;
+	}
+
+	/* If we found anything at all, use it. */
+	if (best_off) {
+		struct ntdb_used_record rec;
+		size_t leftover;
+
+		/* We're happy with this size: take it. */
+		ecode = remove_from_list(ntdb, b_off, best_off, &best);
+		check_list(ntdb, b_off);
+		if (ecode != NTDB_SUCCESS) {
+			goto unlock_err;
+		}
+
+		leftover = record_leftover(keylen, datalen, want_extra,
+					   frec_len(&best));
+
+		assert(keylen + datalen + leftover <= frec_len(&best));
+		/* We need to mark non-free before we drop lock, otherwise
+		 * coalesce() could try to merge it! */
+		ecode = set_header(ntdb, &rec, magic, keylen, datalen,
+				   frec_len(&best) - leftover, hashlow);
+		if (ecode != NTDB_SUCCESS) {
+			goto unlock_err;
+		}
+
+		ecode = ntdb_write_convert(ntdb, best_off, &rec, sizeof(rec));
+		if (ecode != NTDB_SUCCESS) {
+			goto unlock_err;
+		}
+
+		/* For futureproofing, we put a 0 in any unused space. */
+		if (rec_extra_padding(&rec)) {
+			ecode = ntdb->io->twrite(ntdb, best_off + sizeof(rec)
+						+ keylen + datalen, "", 1);
+			if (ecode != NTDB_SUCCESS) {
+				goto unlock_err;
+			}
+		}
+
+		/* Bucket of leftover will be <= current bucket, so nested
+		 * locking is allowed. */
+		if (leftover) {
+			ntdb->stats.alloc_leftover++;
+			ecode = add_free_record(ntdb,
+						best_off + sizeof(rec)
+						+ frec_len(&best) - leftover,
+						leftover, NTDB_LOCK_WAIT, false);
+			if (ecode != NTDB_SUCCESS) {
+				best_off = NTDB_ERR_TO_OFF(ecode);
+			}
+		}
+		ntdb_unlock_free_bucket(ntdb, b_off);
+
+		return best_off;
+	}
+
+	ntdb_unlock_free_bucket(ntdb, b_off);
+	return 0;
+
+unlock_err:
+	ntdb_unlock_free_bucket(ntdb, b_off);
+	return NTDB_ERR_TO_OFF(ecode);
+}
+
+/* Get a free block from any free table (current one first); 0 if none, -ve error. */
+static ntdb_off_t get_free(struct ntdb_context *ntdb,
+			  size_t keylen, size_t datalen, bool want_extra,
+			  unsigned magic, unsigned hashlow)
+{
+	ntdb_off_t off, ftable_off;
+	ntdb_off_t start_b, b, ftable;
+	bool wrapped = false;
+
+	/* If they are growing, add 50% to get to higher bucket. */
+	if (want_extra)
+		start_b = size_to_bucket(adjust_size(keylen,
+						     datalen + datalen / 2));
+	else
+		start_b = size_to_bucket(adjust_size(keylen, datalen));
+
+	ftable_off = ntdb->ftable_off;
+	ftable = ntdb->ftable;
+	while (!wrapped || ftable_off != ntdb->ftable_off) {
+		/* Start at exact size bucket, and search up... */
+		for (b = find_free_head(ntdb, ftable_off, start_b);
+		     b < NTDB_FREE_BUCKETS;
+		     b = find_free_head(ntdb, ftable_off, b + 1)) {
+			/* Try getting one from list. */
+			off = lock_and_alloc(ntdb, ftable_off,
+					     b, keylen, datalen, want_extra,
+					     magic, hashlow);
+			if (NTDB_OFF_IS_ERR(off))
+				return off;
+			if (off != 0) {
+				if (b == start_b)
+					ntdb->stats.alloc_bucket_exact++;
+				if (b == NTDB_FREE_BUCKETS - 1)
+					ntdb->stats.alloc_bucket_max++;
+				/* Worked?  Stay using this list. */
+				ntdb->ftable_off = ftable_off;
+				ntdb->ftable = ftable;
+				return off;
+			}
+			/* Didn't work.  Try next bucket. */
+		}
+
+		if (NTDB_OFF_IS_ERR(b)) {	/* An error offset also ends the loop above. */
+			return b;
+		}
+
+		/* Hmm, try next table. */
+		ftable_off = next_ftable(ntdb, ftable_off);
+		if (NTDB_OFF_IS_ERR(ftable_off)) {
+			return ftable_off;
+		}
+		ftable++;
+
+		if (ftable_off == 0) {	/* End of chain: wrap to the first table. */
+			wrapped = true;
+			ftable_off = first_ftable(ntdb);
+			if (NTDB_OFF_IS_ERR(ftable_off)) {
+				return ftable_off;
+			}
+			ftable = 0;
+		}
+	}
+
+	return 0;
+}
+
+enum NTDB_ERROR set_header(struct ntdb_context *ntdb,
+			  struct ntdb_used_record *rec,
+			  unsigned magic, uint64_t keylen, uint64_t datalen,
+			  uint64_t actuallen, unsigned hashlow)
+{
+	uint64_t keybits = (fls64(keylen) + 1) / 2;
+
+	/* Use bottom bits of hash, so it's independent of hash table size. */
+	rec->magic_and_meta = (hashlow & ((1 << 11)-1))	/* low 11 bits of hash */
+		| ((actuallen - (keylen + datalen)) << 11)	/* extra padding */
+		| (keybits << 43)	/* datalen is shifted by keybits*2 below */
+		| ((uint64_t)magic << 48);
+	rec->key_and_data_len = (keylen | (datalen << (keybits*2)));
+
+	/* Encoding can fail on big values: decode again and compare. */
+	if (rec_key_length(rec) != keylen
+	    || rec_data_length(rec) != datalen
+	    || rec_extra_padding(rec) != actuallen - (keylen + datalen)) {
+		return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+				  "Could not encode k=%llu,d=%llu,a=%llu",
+				  (long long)keylen, (long long)datalen,
+				  (long long)actuallen);
+	}
+	return NTDB_SUCCESS;
+}
+
+/* You need 'size', this tells you how much you should expand by. */
+ntdb_off_t ntdb_expand_adjust(ntdb_off_t map_size, ntdb_off_t size)
+{
+	ntdb_off_t new_size, top_size;
+
+	/* limit size in order to avoid using up huge amounts of memory for
+	 * in memory tdbs if an oddball huge record creeps in */
+	if (size > 100 * 1024) {
+		top_size = map_size + size * 2;
+	} else {
+		top_size = map_size + size * 100;
+	}
+
+	/* Always grow to at least top_size; otherwise grow by 25% if the
+	   DB is no bigger than 100MiB, or by 10% only once it is larger
+	   than that. */
+	if (map_size > 100 * 1024 * 1024) {
+		new_size = map_size * 1.10;
+	} else {
+		new_size = map_size * 1.25;
+	}
+
+	/* Take the larger candidate, and return only the increase. */
+	if (new_size < top_size)
+		new_size = top_size;
+	return new_size - map_size;
+}
+
+/* Grow the file (unless someone else just did) and free-list the new space. */
+static enum NTDB_ERROR ntdb_expand(struct ntdb_context *ntdb, ntdb_len_t size)
+{
+	uint64_t old_size;
+	ntdb_len_t wanted;
+	enum NTDB_ERROR ecode;
+
+	/* Need to hold a hash lock to expand DB: transactions rely on it. */
+	if (!(ntdb->flags & NTDB_NOLOCK)
+	    && !ntdb->file->allrecord_lock.count && !ntdb_has_hash_locks(ntdb)) {
+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+				  "ntdb_expand: must hold lock during expand");
+	}
+
+	/* Only one person can expand file at a time. */
+	ecode = ntdb_lock_expand(ntdb, F_WRLCK);
+	if (ecode != NTDB_SUCCESS) {
+		return ecode;
+	}
+
+	/* Someone else may have expanded the file: if so, let caller retry. */
+	old_size = ntdb->file->map_size;
+	ntdb->io->oob(ntdb, ntdb->file->map_size, 1, true);
+	if (ntdb->file->map_size != old_size) {
+		ntdb_unlock_expand(ntdb, F_WRLCK);
+		return NTDB_SUCCESS;
+	}
+
+	/* Overallocate. */
+	wanted = ntdb_expand_adjust(old_size, size);
+	/* We need room for the record header too. */
+	wanted = adjust_size(0, sizeof(struct ntdb_used_record) + wanted);
+
+	ecode = ntdb->io->expand_file(ntdb, wanted);
+	if (ecode != NTDB_SUCCESS) {
+		ntdb_unlock_expand(ntdb, F_WRLCK);
+		return ecode;
+	}
+
+	/* We need to drop this lock before adding free record. */
+	ntdb_unlock_expand(ntdb, F_WRLCK);
+
+	ntdb->stats.expands++;
+	return add_free_record(ntdb, old_size, wanted, NTDB_LOCK_WAIT, true);
+}
+
+/* Expands the database if it has to; returns record offset, or -ve error. */
+ntdb_off_t alloc(struct ntdb_context *ntdb, size_t keylen, size_t datalen,
+		uint64_t hash, unsigned magic, bool growing)
+{
+	ntdb_off_t off;
+
+	/* We can't hold pointers during this: we could unmap! */
+	assert(!ntdb->direct_access);
+
+	for (;;) {
+		enum NTDB_ERROR ecode;
+		off = get_free(ntdb, keylen, datalen, growing, magic, hash);
+		if (likely(off != 0))	/* Success, or an error offset from get_free(). */
+			break;
+
+		ecode = ntdb_expand(ntdb, adjust_size(keylen, datalen));
+		if (ecode != NTDB_SUCCESS) {
+			return NTDB_ERR_TO_OFF(ecode);
+		}
+	}
+
+	return off;
+}
diff --git a/lib/ntdb/hash.c b/lib/ntdb/hash.c
new file mode 100644
index 0000000000..95b98c0736
--- /dev/null
+++ b/lib/ntdb/hash.c
@@ -0,0 +1,894 @@
+ /*
+   Trivial Database 2: hash handling
+   Copyright (C) Rusty Russell 2010
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <ccan/hash/hash.h>
+#include <assert.h>
+
+/* Default hash function: hash64_stable with its two 32-bit halves swapped. */
+uint64_t ntdb_jenkins_hash(const void *key, size_t length, uint64_t seed,
+			  void *unused)
+{
+	const unsigned char *bytes = key;
+	/* hash64_stable's low bits are the slightly better ones, but we
+	 * consume hash bits from the top down, so exchange the halves. */
+	const uint64_t raw = hash64_stable(bytes, length, seed);
+	return (raw << 32) | (raw >> 32);
+}
+
+uint64_t ntdb_hash(struct ntdb_context *ntdb, const void *ptr, size_t len)
+{
+	return (*ntdb->hash_fn)(ptr, len, ntdb->hash_seed, ntdb->hash_data);
+}
+
+/* Hash the key of the record at off (yields 0 if it cannot be read). */
+uint64_t hash_record(struct ntdb_context *ntdb, ntdb_off_t off)
+{
+	const struct ntdb_used_record *hdr;
+	const void *keybytes;
+	uint64_t keylen, result = 0;
+
+	/* Look at the record header just to learn the key length. */
+	hdr = ntdb_access_read(ntdb, off, sizeof(*hdr), true);
+	if (NTDB_PTR_IS_ERR(hdr)) {
+		/* FIXME: a read failure is indistinguishable from hash 0. */
+		return result;
+	}
+	keylen = rec_key_length(hdr);
+	ntdb_access_release(ntdb, hdr);
+
+	/* The key bytes sit immediately after the header. */
+	keybytes = ntdb_access_read(ntdb, off + sizeof(*hdr), keylen, false);
+	if (!NTDB_PTR_IS_ERR(keybytes)) {
+		result = ntdb_hash(ntdb, keybytes, keylen);
+		ntdb_access_release(ntdb, keybytes);
+	}
+	return result;
+}
+
+/* Return num (<= 32) bits of val, starting at bit start (0 = lowest). */
+static uint32_t bits_from(uint64_t val, unsigned start, unsigned num)
+{
+	assert(num <= 32);	/* result must fit the uint32_t we return */
+	return (val >> start) & ((1ULL << num) - 1); /* 1U << 32 would be UB */
+}
+
+/* Take the next num hash bits from the top, so lock ranges cover sections. */
+static uint32_t use_bits(struct hash_info *h, unsigned num)
+{
+	const unsigned first_unused = 64 - h->hash_used;
+	h->hash_used += num;
+	return bits_from(h->h, first_unused - num, num);
+}
+
+static ntdb_bool_err key_matches(struct ntdb_context *ntdb,
+				const struct ntdb_used_record *rec,
+				ntdb_off_t off,
+				const NTDB_DATA *key)
+{
+	const char *stored;
+	bool same;
+
+	/* Different lengths can never match: skip reading the key. */
+	if (rec_key_length(rec) != key->dsize) {
+		ntdb->stats.compare_wrong_keylen++;
+		return false;
+	}
+
+	/* The stored key follows the record header at off. */
+	stored = ntdb_access_read(ntdb, off + sizeof(*rec), key->dsize, false);
+	if (NTDB_PTR_IS_ERR(stored))
+		return (ntdb_bool_err)NTDB_PTR_ERR(stored);
+	same = !memcmp(stored, key->dptr, key->dsize);
+	if (!same)
+		ntdb->stats.compare_wrong_keycmp++;
+	ntdb_access_release(ntdb, stored);
+	return same;
+}
+
+/* Does hash entry val refer to key?  true/false, or -ve on read error. */
+static ntdb_bool_err match(struct ntdb_context *ntdb,
+			  struct hash_info *h,
+			  const NTDB_DATA *key,
+			  ntdb_off_t val,
+			  struct ntdb_used_record *rec)
+{
+	ntdb_off_t off;
+	enum NTDB_ERROR ecode;
+
+	ntdb->stats.compares++;
+	/* Desired bucket must match. */
+	if (h->home_bucket != (val & NTDB_OFF_HASH_GROUP_MASK)) {
+		ntdb->stats.compare_wrong_bucket++;
+		return false;
+	}
+
+	/* Top bits of offset == next bits of hash (see encode_offset). */
+	if (bits_from(val, NTDB_OFF_HASH_EXTRA_BIT, NTDB_OFF_UPPER_STEAL_EXTRA)
+	    != bits_from(h->h, 64 - h->hash_used - NTDB_OFF_UPPER_STEAL_EXTRA,
+		    NTDB_OFF_UPPER_STEAL_EXTRA)) {
+		ntdb->stats.compare_wrong_offsetbits++;
+		return false;
+	}
+
+	off = val & NTDB_OFF_MASK;	/* offset part of the entry */
+	ecode = ntdb_read_convert(ntdb, off, rec, sizeof(*rec));
+	if (ecode != NTDB_SUCCESS) {
+		return (ntdb_bool_err)ecode;
+	}
+
+	if ((h->h & ((1 << 11)-1)) != rec_hash(rec)) { /* low 11 hash bits */
+		ntdb->stats.compare_wrong_rechash++;
+		return false;
+	}
+
+	return key_matches(ntdb, rec, off, key);	/* full key compare */
+}
+
+static ntdb_off_t hbucket_off(ntdb_off_t group_start, unsigned bucket)
+{
+	unsigned slot = bucket % (1 << NTDB_HASH_GROUP_BITS); /* wrap in group */
+	return group_start + slot * sizeof(ntdb_off_t);
+}
+
+bool is_subhash(ntdb_off_t val)
+{
+	return (val & (1ULL << NTDB_OFF_UPPER_STEAL_SUBHASH_BIT)) != 0;
+}
+
+/* Hash range to lock for a top-level group.  FIXME: guess depth, lock less! */
+static ntdb_off_t hlock_range(ntdb_off_t group, ntdb_off_t *size)
+{
+	*size = 1ULL << (64 + NTDB_HASH_GROUP_BITS - NTDB_TOPLEVEL_HASH_BITS);
+	return group << (64 + NTDB_HASH_GROUP_BITS - NTDB_TOPLEVEL_HASH_BITS);
+}
+
+static ntdb_off_t COLD find_in_chain(struct ntdb_context *ntdb,
+				    NTDB_DATA key,
+				    ntdb_off_t chain,
+				    struct hash_info *h,
+				    struct ntdb_used_record *rec,
+				    struct traverse_info *tinfo)
+{
+	ntdb_off_t off, next;
+	enum NTDB_ERROR ecode;
+
+	/* In case nothing is free, we set these to zero. */
+	h->home_bucket = h->found_bucket = 0;
+
+	for (off = chain; off; off = next) {	/* one pass per chain block */
+		unsigned int i;
+
+		h->group_start = off;
+		ecode = ntdb_read_convert(ntdb, off, h->group, sizeof(h->group));
+		if (ecode != NTDB_SUCCESS) {
+			return NTDB_ERR_TO_OFF(ecode);
+		}
+
+		for (i = 0; i < (1 << NTDB_HASH_GROUP_BITS); i++) {
+			ntdb_off_t recoff;
+			if (!h->group[i]) {
+				/* Remember this empty bucket. */
+				h->home_bucket = h->found_bucket = i;
+				continue;
+			}
+
+			/* add_to_hash's empty-bucket path stores encoded
+			 * entries, so mask down to the record offset. */
+			recoff = h->group[i] & NTDB_OFF_MASK;
+			ecode = ntdb_read_convert(ntdb, recoff, rec,
+						 sizeof(*rec));
+			if (ecode != NTDB_SUCCESS) {
+				return NTDB_ERR_TO_OFF(ecode);
+			}
+
+			ecode = NTDB_OFF_TO_ERR(key_matches(ntdb, rec, recoff,
+							   &key));
+			if (ecode < 0) {	/* key_matches() failed */
+				return NTDB_ERR_TO_OFF(ecode);
+			}
+			if (ecode == (enum NTDB_ERROR)1) { /* 1 == true: found */
+				h->home_bucket = h->found_bucket = i;
+
+				if (tinfo) {
+					tinfo->levels[tinfo->num_levels]
+						.hashtable = off;
+					tinfo->levels[tinfo->num_levels]
+						.total_buckets
+						= 1 << NTDB_HASH_GROUP_BITS;
+					tinfo->levels[tinfo->num_levels].entry
+						= i;
+					tinfo->num_levels++;
+				}
+				return recoff;
+			}
+		}
+		next = ntdb_read_off(ntdb, off
+				    + offsetof(struct ntdb_chain, next));
+		if (NTDB_OFF_IS_ERR(next)) {
+			return next;
+		}
+		if (next)	/* stored link is a record offset: skip header */
+			next += sizeof(struct ntdb_used_record);
+	}
+	return 0;	/* key is in no block of the chain */
+}
+
+/* This is the core routine which searches the hashtable for an entry.
+ * On error, no locks are held and -ve is returned.
+ * Otherwise the hash lock stays held and hinfo (and optional tinfo) is set:
+ * returns 0 if not found, else the record offset with *rec filled in. */
+ntdb_off_t find_and_lock(struct ntdb_context *ntdb,
+			NTDB_DATA key,
+			int ltype,
+			struct hash_info *h,
+			struct ntdb_used_record *rec,
+			struct traverse_info *tinfo)
+{
+	uint32_t i, group;
+	ntdb_off_t hashtable, off;
+	enum NTDB_ERROR ecode;
+
+	h->h = ntdb_hash(ntdb, key.dptr, key.dsize);
+	h->hash_used = 0;
+	group = use_bits(h, NTDB_TOPLEVEL_HASH_BITS - NTDB_HASH_GROUP_BITS);
+	h->home_bucket = use_bits(h, NTDB_HASH_GROUP_BITS);
+
+	h->hlock_start = hlock_range(group, &h->hlock_range);
+	ecode = ntdb_lock_hashes(ntdb, h->hlock_start, h->hlock_range, ltype,
+				NTDB_LOCK_WAIT);
+	if (ecode != NTDB_SUCCESS)
+		return NTDB_ERR_TO_OFF(ecode);	/* lock was not taken */
+
+	hashtable = offsetof(struct ntdb_header, hashtable);
+	if (tinfo) {
+		tinfo->toplevel_group = group;
+		tinfo->num_levels = 1;
+		tinfo->levels[0].entry = 0;
+		tinfo->levels[0].hashtable = hashtable
+			+ (group << NTDB_HASH_GROUP_BITS) * sizeof(ntdb_off_t);
+		tinfo->levels[0].total_buckets = 1 << NTDB_HASH_GROUP_BITS;
+	}
+
+	while (h->hash_used <= 64) {
+		/* Read in the hash group. */
+		h->group_start = hashtable
+			+ group * (sizeof(ntdb_off_t) << NTDB_HASH_GROUP_BITS);
+
+		ecode = ntdb_read_convert(ntdb, h->group_start, &h->group,
+					 sizeof(h->group));
+		if (ecode != NTDB_SUCCESS)
+			goto fail;
+
+		/* Pointer to another hash table?  Go down... */
+		if (is_subhash(h->group[h->home_bucket])) {
+			hashtable = (h->group[h->home_bucket] & NTDB_OFF_MASK)
+				+ sizeof(struct ntdb_used_record);
+			if (tinfo) {
+				/* When we come back, use *next* bucket */
+				tinfo->levels[tinfo->num_levels-1].entry
+					+= h->home_bucket + 1;
+			}
+			group = use_bits(h, NTDB_SUBLEVEL_HASH_BITS
+					 - NTDB_HASH_GROUP_BITS);
+			h->home_bucket = use_bits(h, NTDB_HASH_GROUP_BITS);
+			if (tinfo) {
+				tinfo->levels[tinfo->num_levels].hashtable
+					= hashtable;
+				tinfo->levels[tinfo->num_levels].total_buckets
+					= 1 << NTDB_SUBLEVEL_HASH_BITS;
+				tinfo->levels[tinfo->num_levels].entry
+					= group << NTDB_HASH_GROUP_BITS;
+				tinfo->num_levels++;
+			}
+			continue;
+		}
+
+		/* It's in this group: search (until 0 or all searched) */
+		for (i = 0, h->found_bucket = h->home_bucket;
+		     i < (1 << NTDB_HASH_GROUP_BITS);
+		     i++, h->found_bucket = ((h->found_bucket+1)
+					     % (1 << NTDB_HASH_GROUP_BITS))) {
+			ntdb_bool_err berr;
+			if (is_subhash(h->group[h->found_bucket]))
+				continue;
+
+			if (!h->group[h->found_bucket])
+				break;
+
+			berr = match(ntdb, h, &key, h->group[h->found_bucket],
+				     rec);
+			if (berr < 0) {
+				ecode = NTDB_OFF_TO_ERR(berr);
+				goto fail;
+			}
+			if (berr) {
+				if (tinfo) {
+					tinfo->levels[tinfo->num_levels-1].entry
+						+= h->found_bucket;
+				}
+				return h->group[h->found_bucket] & NTDB_OFF_MASK;
+			}
+		}
+		/* Didn't find it: h indicates where it would go. */
+		return 0;
+	}
+
+	/* Out of hash bits: the entry can only live in the chain. */
+	off = find_in_chain(ntdb, key, hashtable, h, rec, tinfo);
+	if (!NTDB_OFF_IS_ERR(off))
+		return off;
+	ecode = NTDB_OFF_TO_ERR(off);	/* honour "no locks held on error" */
+fail:
+	ntdb_unlock_hashes(ntdb, h->hlock_start, h->hlock_range, ltype);
+	return NTDB_ERR_TO_OFF(ecode);
+}
+
+/* I wrote a simple test, expanding a hash to 2GB, for the following
+ * cases:
+ * 1) Expanding all the buckets at once,
+ * 2) Expanding the bucket we wanted to place the new entry into.
+ * 3) Expanding the most-populated bucket,
+ *
+ * I measured the worst/average/best density during this process.
+ * 1) 3%/16%/30%
+ * 2) 4%/20%/38%
+ * 3) 6%/22%/41%
+ *
+ * So we pick the home bucket claimed by most entries (new one included).
+ * NOTE(review): empty slots count towards bucket 0 -- confirm intended. */
+static unsigned fullest_bucket(struct ntdb_context *ntdb,
+			       const ntdb_off_t *group,
+			       unsigned new_bucket)
+{
+	unsigned counts[1 << NTDB_HASH_GROUP_BITS] = { 0 };
+	unsigned int i, best_bucket;
+
+	/* Count the new entry. */
+	counts[new_bucket]++;
+	best_bucket = new_bucket;
+
+	for (i = 0; i < (1 << NTDB_HASH_GROUP_BITS); i++) {
+		unsigned this_bucket;
+
+		if (is_subhash(group[i]))	/* not a record entry */
+			continue;
+		this_bucket = group[i] & NTDB_OFF_HASH_GROUP_MASK;
+		if (++counts[this_bucket] > counts[best_bucket])
+			best_bucket = this_bucket;	/* strictly fuller wins */
+	}
+
+	return best_bucket;
+}
+
+/* Store encoded in the first empty slot at or after bucket; false if full. */
+static bool put_into_group(ntdb_off_t *group,
+			   unsigned bucket, ntdb_off_t encoded)
+{
+	unsigned probe, nslots = 1 << NTDB_HASH_GROUP_BITS;
+
+	for (probe = 0; probe < nslots; probe++) {
+		ntdb_off_t *slot = &group[(bucket + probe) % nslots];
+		if (*slot)
+			continue;
+		*slot = encoded;
+		return true;
+	}
+	return false;
+}
+
+static void force_into_group(ntdb_off_t *group,
+			     unsigned bucket, ntdb_off_t encoded)
+{
+	if (!put_into_group(group, bucket, encoded))	/* group was full */
+		abort();	/* callers only use this when room must exist */
+}
+
+/* Build a hash entry: record offset, home bucket and spare hash bits. */
+static ntdb_off_t encode_offset(ntdb_off_t new_off, struct hash_info *h)
+{
+	/* The hash bits just below those already used; match() checks them. */
+	unsigned start = 64 - h->hash_used - NTDB_OFF_UPPER_STEAL_EXTRA;
+	uint64_t extra = bits_from(h->h, start, NTDB_OFF_UPPER_STEAL_EXTRA);
+
+	return new_off | h->home_bucket | (extra << NTDB_OFF_HASH_EXTRA_BIT);
+}
+
+/* Point the bucket located by the preceding search at new_off instead. */
+enum NTDB_ERROR replace_in_hash(struct ntdb_context *ntdb,
+			       struct hash_info *h,
+			       ntdb_off_t new_off)
+{
+	ntdb_off_t slot = hbucket_off(h->group_start, h->found_bucket);
+	return ntdb_write_off(ntdb, slot, encode_offset(new_off, h));
+}
+
+/* We slot in anywhere that's empty in the chain (subhash = chain contents). */
+static enum NTDB_ERROR COLD add_to_chain(struct ntdb_context *ntdb,
+					ntdb_off_t subhash,
+					ntdb_off_t new_off)
+{
+	ntdb_off_t entry;
+	enum NTDB_ERROR ecode;
+
+	entry = ntdb_find_zero_off(ntdb, subhash, 1<<NTDB_HASH_GROUP_BITS);
+	if (NTDB_OFF_IS_ERR(entry))
+		return NTDB_OFF_TO_ERR(entry);
+
+	if (entry == 1 << NTDB_HASH_GROUP_BITS) {
+		ntdb_off_t next;
+
+		next = ntdb_read_off(ntdb, subhash
+				    + offsetof(struct ntdb_chain, next));
+		if (NTDB_OFF_IS_ERR(next))
+			return NTDB_OFF_TO_ERR(next);
+
+		if (!next) {
+			next = alloc(ntdb, 0, sizeof(struct ntdb_chain), 0,
+				     NTDB_CHAIN_MAGIC, false);
+			if (NTDB_OFF_IS_ERR(next))
+				return NTDB_OFF_TO_ERR(next);
+			ecode = zero_out(ntdb,
+					 next+sizeof(struct ntdb_used_record),
+					 sizeof(struct ntdb_chain));
+			if (ecode != NTDB_SUCCESS)
+				return ecode;
+			ecode = ntdb_write_off(ntdb, subhash
+					      + offsetof(struct ntdb_chain,
+							 next),
+					      next);
+			if (ecode != NTDB_SUCCESS)
+				return ecode;
+		}
+		/* next is a record offset: step over its header, as
+		 * find_in_chain() does, to reach the chain contents. */
+		return add_to_chain(ntdb,
+				    next + sizeof(struct ntdb_used_record),
+				    new_off);
+	}
+
+	return ntdb_write_off(ntdb, subhash + entry * sizeof(ntdb_off_t),
+			     new_off);
+}
+
+/* Add into a newly created subhash (a chain once hash bits run out). */
+static enum NTDB_ERROR add_to_subhash(struct ntdb_context *ntdb, ntdb_off_t subhash,
+				     unsigned hash_used, ntdb_off_t val)
+{
+	ntdb_off_t off = (val & NTDB_OFF_MASK), *group;
+	struct hash_info h;
+	unsigned int gnum;
+
+	h.hash_used = hash_used;
+
+	if (hash_used + NTDB_SUBLEVEL_HASH_BITS > 64)	/* too few bits left */
+		return add_to_chain(ntdb, subhash, off);
+
+	h.h = hash_record(ntdb, off);	/* NOTE(review): 0 on read error */
+	gnum = use_bits(&h, NTDB_SUBLEVEL_HASH_BITS-NTDB_HASH_GROUP_BITS);
+	h.group_start = subhash
+		+ gnum * (sizeof(ntdb_off_t) << NTDB_HASH_GROUP_BITS);
+	h.home_bucket = use_bits(&h, NTDB_HASH_GROUP_BITS);
+
+	group = ntdb_access_write(ntdb, h.group_start,
+				 sizeof(*group) << NTDB_HASH_GROUP_BITS, true);
+	if (NTDB_PTR_IS_ERR(group)) {
+		return NTDB_PTR_ERR(group);
+	}
+	force_into_group(group, h.home_bucket, encode_offset(off, &h));
+	return ntdb_access_commit(ntdb, group);
+}
+
+static enum NTDB_ERROR expand_group(struct ntdb_context *ntdb, struct hash_info *h)
+{
+	unsigned bucket, num_vals, i, magic;
+	size_t subsize;
+	ntdb_off_t subhash;
+	ntdb_off_t vals[1 << NTDB_HASH_GROUP_BITS];
+	enum NTDB_ERROR ecode;
+
+	/* Attach new empty subhash under fullest bucket. */
+	bucket = fullest_bucket(ntdb, h->group, h->home_bucket);
+
+	if (h->hash_used == 64) {	/* hash used up: hang a chain */
+		ntdb->stats.alloc_chain++;
+		subsize = sizeof(struct ntdb_chain);
+		magic = NTDB_CHAIN_MAGIC;
+	} else {
+		ntdb->stats.alloc_subhash++;
+		subsize = (sizeof(ntdb_off_t) << NTDB_SUBLEVEL_HASH_BITS);
+		magic = NTDB_HTABLE_MAGIC;
+	}
+
+	subhash = alloc(ntdb, 0, subsize, 0, magic, false);
+	if (NTDB_OFF_IS_ERR(subhash)) {
+		return NTDB_OFF_TO_ERR(subhash);
+	}
+
+	ecode = zero_out(ntdb, subhash + sizeof(struct ntdb_used_record),
+			 subsize);
+	if (ecode != NTDB_SUCCESS) {
+		return ecode;	/* NOTE(review): subhash is not freed here */
+	}
+
+	/* Remove any which are destined for bucket or are in wrong place. */
+	num_vals = 0;
+	for (i = 0; i < (1 << NTDB_HASH_GROUP_BITS); i++) {
+		unsigned home_bucket = h->group[i] & NTDB_OFF_HASH_GROUP_MASK;
+		if (!h->group[i] || is_subhash(h->group[i]))
+			continue;
+		if (home_bucket == bucket || home_bucket != i) {
+			vals[num_vals++] = h->group[i];
+			h->group[i] = 0;
+		}
+	}
+	/* FIXME: This assert is valid, but we do this during unit test :( */
+	/* assert(num_vals); */
+
+	/* Overwrite expanded bucket with subhash pointer. */
+	h->group[bucket] = subhash | (1ULL << NTDB_OFF_UPPER_STEAL_SUBHASH_BIT);
+
+	/* Point to actual contents of record. */
+	subhash += sizeof(struct ntdb_used_record);
+
+	/* Put values back: into the new subhash, or reprobed in h->group. */
+	for (i = 0; i < num_vals; i++) {
+		unsigned this_bucket = vals[i] & NTDB_OFF_HASH_GROUP_MASK;
+
+		if (this_bucket == bucket) {
+			ecode = add_to_subhash(ntdb, subhash, h->hash_used,
+					       vals[i]);
+			if (ecode != NTDB_SUCCESS)
+				return ecode;
+		} else {
+			/* There should be room to put this back. */
+			force_into_group(h->group, this_bucket, vals[i]);
+		}
+	}
+	return NTDB_SUCCESS;	/* h->group is modified in memory only */
+}
+
+enum NTDB_ERROR delete_from_hash(struct ntdb_context *ntdb, struct hash_info *h)
+{
+	unsigned int i, num_movers = 0;
+	ntdb_off_t movers[1 << NTDB_HASH_GROUP_BITS];
+
+	h->group[h->found_bucket] = 0;	/* drop the entry the search found */
+	for (i = 1; i < (1 << NTDB_HASH_GROUP_BITS); i++) {
+		unsigned this_bucket;
+
+		this_bucket = (h->found_bucket+i) % (1 << NTDB_HASH_GROUP_BITS);
+		/* Empty bucket?  We're done. */
+		if (!h->group[this_bucket])
+			break;
+
+		/* Ignore subhashes. */
+		if (is_subhash(h->group[this_bucket]))
+			continue;
+
+		/* If this one is not happy where it is, we'll move it. */
+		if ((h->group[this_bucket] & NTDB_OFF_HASH_GROUP_MASK)
+		    != this_bucket) {
+			movers[num_movers++] = h->group[this_bucket];
+			h->group[this_bucket] = 0;
+		}
+	}
+
+	/* Put back the ones we erased, reprobing from each one's home bucket. */
+	for (i = 0; i < num_movers; i++) {
+		force_into_group(h->group, movers[i] & NTDB_OFF_HASH_GROUP_MASK,
+				 movers[i]);
+	}
+
+	/* Now we write back the hash group */
+	return ntdb_write_convert(ntdb, h->group_start,
+				 h->group, sizeof(h->group));
+}
+
+enum NTDB_ERROR add_to_hash(struct ntdb_context *ntdb, struct hash_info *h,
+			   ntdb_off_t new_off)
+{
+	enum NTDB_ERROR ecode;
+
+	/* We hit an empty bucket during search?  That's where it goes. */
+	if (!h->group[h->found_bucket]) {
+		h->group[h->found_bucket] = encode_offset(new_off, h);
+		/* Write back the modified group. */
+		return ntdb_write_convert(ntdb, h->group_start,
+					 h->group, sizeof(h->group));
+	}
+
+	if (h->hash_used > 64)	/* hash bits exhausted: h describes a chain */
+		return add_to_chain(ntdb, h->group_start, new_off);
+
+	/* We're full.  Expand. */
+	ecode = expand_group(ntdb, h);
+	if (ecode != NTDB_SUCCESS) {
+		return ecode;
+	}
+
+	if (is_subhash(h->group[h->home_bucket])) {
+		/* We were expanded! */
+		ntdb_off_t hashtable;
+		unsigned int gnum;
+
+		/* Write back the modified group. */
+		ecode = ntdb_write_convert(ntdb, h->group_start, h->group,
+					  sizeof(h->group));
+		if (ecode != NTDB_SUCCESS) {
+			return ecode;
+		}
+
+		/* Move hashinfo down a level. */
+		hashtable = (h->group[h->home_bucket] & NTDB_OFF_MASK)
+			+ sizeof(struct ntdb_used_record);
+		gnum = use_bits(h,NTDB_SUBLEVEL_HASH_BITS - NTDB_HASH_GROUP_BITS);
+		h->home_bucket = use_bits(h, NTDB_HASH_GROUP_BITS);
+		h->group_start = hashtable
+			+ gnum * (sizeof(ntdb_off_t) << NTDB_HASH_GROUP_BITS);
+		ecode = ntdb_read_convert(ntdb, h->group_start, &h->group,
+					 sizeof(h->group));
+		if (ecode != NTDB_SUCCESS) {
+			return ecode;
+		}
+	}
+
+	/* Expanding the group must have made room if it didn't choose this
+	 * bucket. */
+	if (put_into_group(h->group, h->home_bucket, encode_offset(new_off,h))){
+		return ntdb_write_convert(ntdb, h->group_start,
+					 h->group, sizeof(h->group));
+	}
+
+	/* This can happen if all hashes in group (and us) dropped into same
+	 * group in subhash: recurse to expand again, one level further down. */
+	return add_to_hash(ntdb, h, new_off);
+}
+
+/* Traverse support: returns offset of record, or 0 or -ve error. */
+static ntdb_off_t iterate_hash(struct ntdb_context *ntdb,
+			      struct traverse_info *tinfo)
+{
+	ntdb_off_t off, val, i;
+	struct traverse_level *tlevel;
+
+	tlevel = &tinfo->levels[tinfo->num_levels-1];	/* deepest level */
+
+again:
+	for (i = ntdb_find_nonzero_off(ntdb, tlevel->hashtable,
+				      tlevel->entry, tlevel->total_buckets);
+	     i != tlevel->total_buckets;
+	     i = ntdb_find_nonzero_off(ntdb, tlevel->hashtable,
+				      i+1, tlevel->total_buckets)) {
+		if (NTDB_OFF_IS_ERR(i)) {
+			return i;
+		}
+
+		val = ntdb_read_off(ntdb, tlevel->hashtable+sizeof(ntdb_off_t)*i);
+		if (NTDB_OFF_IS_ERR(val)) {
+			return val;
+		}
+
+		off = val & NTDB_OFF_MASK;
+
+		/* This makes the delete-all-in-traverse case work
+		 * (and simplifies our logic a little). */
+		if (off == tinfo->prev)
+			continue;
+
+		tlevel->entry = i;	/* where the next call resumes */
+
+		if (!is_subhash(val)) {
+			/* Found one. */
+			tinfo->prev = off;
+			return off;
+		}
+
+		/* When we come back, we want the next one */
+		tlevel->entry++;
+		tinfo->num_levels++;
+		tlevel++;
+		tlevel->hashtable = off + sizeof(struct ntdb_used_record);
+		tlevel->entry = 0;
+		/* Next level is a chain? */
+		if (unlikely(tinfo->num_levels == NTDB_MAX_LEVELS + 1))
+			tlevel->total_buckets = (1 << NTDB_HASH_GROUP_BITS);
+		else
+			tlevel->total_buckets = (1 << NTDB_SUBLEVEL_HASH_BITS);
+		goto again;
+	}
+
+	/* Nothing there? */
+	if (tinfo->num_levels == 1)	/* top level: nothing to pop */
+		return 0;
+
+	/* Handle chained entries. */
+	if (unlikely(tinfo->num_levels == NTDB_MAX_LEVELS + 1)) {
+		tlevel->hashtable = ntdb_read_off(ntdb, tlevel->hashtable
+						 + offsetof(struct ntdb_chain,
+							    next));
+		if (NTDB_OFF_IS_ERR(tlevel->hashtable)) {
+			return tlevel->hashtable;
+		}
+		if (tlevel->hashtable) {	/* another chain block follows */
+			tlevel->hashtable += sizeof(struct ntdb_used_record);
+			tlevel->entry = 0;
+			goto again;
+		}
+	}
+
+	/* Go back up and keep searching. */
+	tinfo->num_levels--;
+	tlevel--;
+	goto again;
+}
+
+/* Traverse step: NTDB_SUCCESS with *kbuf filled, NTDB_ERR_NOEXIST at end. */
+enum NTDB_ERROR next_in_hash(struct ntdb_context *ntdb,
+			    struct traverse_info *tinfo,
+			    NTDB_DATA *kbuf, size_t *dlen)
+{
+	const unsigned group_bits = NTDB_TOPLEVEL_HASH_BITS-NTDB_HASH_GROUP_BITS;
+	ntdb_off_t hl_start, hl_range, off;
+	enum NTDB_ERROR ecode;
+
+	while (tinfo->toplevel_group < (1 << group_bits)) {
+		/* Lock exactly the range find_and_lock()/chainlock() use for
+		 * this group, so all lockers agree on what a group covers. */
+		hl_start = hlock_range(tinfo->toplevel_group, &hl_range);
+		ecode = ntdb_lock_hashes(ntdb, hl_start, hl_range, F_RDLCK,
+					NTDB_LOCK_WAIT);
+		if (ecode != NTDB_SUCCESS) {
+			return ecode;
+		}
+
+		off = iterate_hash(ntdb, tinfo);
+		if (off) {
+			struct ntdb_used_record rec;
+
+			if (NTDB_OFF_IS_ERR(off)) {
+				ecode = NTDB_OFF_TO_ERR(off);
+				goto fail;
+			}
+
+			ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
+			if (ecode != NTDB_SUCCESS) {
+				goto fail;
+			}
+			if (rec_magic(&rec) != NTDB_USED_MAGIC) {
+				ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
+						   NTDB_LOG_ERROR,
+						   "next_in_hash:"
+						   " corrupt record at %llu",
+						   (unsigned long long)off);
+				goto fail;
+			}
+
+			kbuf->dsize = rec_key_length(&rec);
+
+			/* They want data as well? */
+			if (dlen) {
+				*dlen = rec_data_length(&rec);
+				kbuf->dptr = ntdb_alloc_read(ntdb,
+							    off + sizeof(rec),
+							    kbuf->dsize
+							    + *dlen);
+			} else {
+				kbuf->dptr = ntdb_alloc_read(ntdb,
+							    off + sizeof(rec),
+							    kbuf->dsize);
+			}
+			ntdb_unlock_hashes(ntdb, hl_start, hl_range, F_RDLCK);
+			if (NTDB_PTR_IS_ERR(kbuf->dptr)) {
+				return NTDB_PTR_ERR(kbuf->dptr);
+			}
+			return NTDB_SUCCESS;
+		}
+
+		ntdb_unlock_hashes(ntdb, hl_start, hl_range, F_RDLCK);
+
+		/* Nothing (more) in this group: move on to the next one. */
+		tinfo->toplevel_group++;
+		tinfo->levels[0].hashtable
+			+= (sizeof(ntdb_off_t) << NTDB_HASH_GROUP_BITS);
+		tinfo->levels[0].entry = 0;
+	}
+	return NTDB_ERR_NOEXIST;
+
+fail:
+	ntdb_unlock_hashes(ntdb, hl_start, hl_range, F_RDLCK);
+	return ecode;
+}
+
+enum NTDB_ERROR first_in_hash(struct ntdb_context *ntdb,
+			     struct traverse_info *tinfo,
+			     NTDB_DATA *kbuf, size_t *dlen)
+{
+	struct traverse_level *top = &tinfo->levels[0];
+	/* Rewind to bucket 0 of top-level group 0, then find the first entry. */
+	top->hashtable = offsetof(struct ntdb_header, hashtable);
+	top->total_buckets = (1 << NTDB_HASH_GROUP_BITS);
+	top->entry = 0;
+	tinfo->num_levels = 1;
+	tinfo->toplevel_group = tinfo->prev = 0;
+	return next_in_hash(ntdb, tinfo, kbuf, dlen);
+}
+
+/* Lock the top-level hash group key falls in: whether or not the entry
+ * exists, this is the range anyone looking for it would have to lock. */
+static enum NTDB_ERROR chainlock(struct ntdb_context *ntdb, const NTDB_DATA *key,
+				int ltype, enum ntdb_lock_flags waitflag,
+				const char *func)
+{
+	const unsigned int top_bits
+		= NTDB_TOPLEVEL_HASH_BITS - NTDB_HASH_GROUP_BITS;
+	const uint64_t hash = ntdb_hash(ntdb, key->dptr, key->dsize);
+	ntdb_off_t start, len;
+	enum NTDB_ERROR result;
+
+	/* The group number is the topmost top_bits bits of the hash. */
+	start = hlock_range(bits_from(hash, 64 - top_bits, top_bits), &len);
+	result = ntdb_lock_hashes(ntdb, start, len, ltype, waitflag);
+
+	/* Traced whether or not the lock was obtained. */
+	ntdb_trace_1rec(ntdb, func, *key);
+	return result;
+}
+
+/* Write-lock key's hash chain; other records may get locked along with it. */
+_PUBLIC_ enum NTDB_ERROR ntdb_chainlock(struct ntdb_context *ntdb, NTDB_DATA key)
+{
+	enum NTDB_ERROR ecode = chainlock(ntdb, &key, F_WRLCK, NTDB_LOCK_WAIT,
+					  "ntdb_chainlock");
+	return ntdb->last_error = ecode;
+}
+
+_PUBLIC_ void ntdb_chainunlock(struct ntdb_context *ntdb, NTDB_DATA key)
+{
+	const unsigned int top_bits
+		= NTDB_TOPLEVEL_HASH_BITS - NTDB_HASH_GROUP_BITS;
+	const uint64_t hash = ntdb_hash(ntdb, key.dptr, key.dsize);
+	ntdb_off_t start, len;
+
+	/* Recompute the range ntdb_chainlock() locked for this key. */
+	start = hlock_range(bits_from(hash, 64 - top_bits, top_bits), &len);
+
+	ntdb_trace_1rec(ntdb, "ntdb_chainunlock", key);
+	/* Release the write lock on that range. */
+	ntdb_unlock_hashes(ntdb, start, len, F_WRLCK);
+}
+
+_PUBLIC_ enum NTDB_ERROR ntdb_chainlock_read(struct ntdb_context *ntdb, NTDB_DATA key)
+{
+	return ntdb->last_error = chainlock(ntdb, &key, F_RDLCK, NTDB_LOCK_WAIT,
+					   "ntdb_chainlock_read"); /* trace name */
+}
+
+_PUBLIC_ void ntdb_chainunlock_read(struct ntdb_context *ntdb, NTDB_DATA key)
+{
+	const unsigned int top_bits
+		= NTDB_TOPLEVEL_HASH_BITS - NTDB_HASH_GROUP_BITS;
+	const uint64_t hash = ntdb_hash(ntdb, key.dptr, key.dsize);
+	ntdb_off_t start, len;
+
+	/* Recompute the range ntdb_chainlock_read() locked for this key. */
+	start = hlock_range(bits_from(hash, 64 - top_bits, top_bits), &len);
+
+	ntdb_trace_1rec(ntdb, "ntdb_chainunlock_read", key);
+	/* Release the read lock on that range. */
+	ntdb_unlock_hashes(ntdb, start, len, F_RDLCK);
+}
diff --git a/lib/ntdb/io.c b/lib/ntdb/io.c
new file mode 100644
index 0000000000..4580520fa2
--- /dev/null
+++ b/lib/ntdb/io.c
@@ -0,0 +1,650 @@
+ /*
+   Unix SMB/CIFS implementation.
+
+   trivial database library
+
+   Copyright (C) Andrew Tridgell              1999-2005
+   Copyright (C) Paul `Rusty' Russell		   2000
+   Copyright (C) Jeremy Allison			   2000-2003
+   Copyright (C) Rusty Russell			   2010
+
+     ** NOTE! The following LGPL license applies to the ntdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <assert.h>
+#include <ccan/likely/likely.h>
+
+void ntdb_munmap(struct ntdb_file *file)
+{
+	if (file->fd == -1)	/* presumably an internal db - TODO confirm */
+		return;
+	/* Clear map_ptr after unmapping so a repeated call does nothing. */
+	if (file->map_ptr) {
+		munmap(file->map_ptr, file->map_size);
+		file->map_ptr = NULL;
+	}
+}
+
+enum NTDB_ERROR ntdb_mmap(struct ntdb_context *ntdb)
+{
+	int mmap_flags;
+	/* NTDB_INTERNAL databases are never mmap()ed. */
+	if (ntdb->flags & NTDB_INTERNAL)
+		return NTDB_SUCCESS;
+
+#ifndef HAVE_INCOHERENT_MMAP
+	if (ntdb->flags & NTDB_NOMMAP)
+		return NTDB_SUCCESS;
+#endif
+	/* Map read-only when the file was opened O_RDONLY. */
+	if ((ntdb->open_flags & O_ACCMODE) == O_RDONLY)
+		mmap_flags = PROT_READ;
+	else
+		mmap_flags = PROT_READ | PROT_WRITE;
+
+	/* size_t can be smaller than off_t. */
+	if ((size_t)ntdb->file->map_size == ntdb->file->map_size) {
+		ntdb->file->map_ptr = mmap(NULL, ntdb->file->map_size,
+					  mmap_flags,
+					  MAP_SHARED, ntdb->file->fd, 0);
+	} else
+		ntdb->file->map_ptr = MAP_FAILED;
+
+	/*
+	 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
+	 */
+	if (ntdb->file->map_ptr == MAP_FAILED) {
+		ntdb->file->map_ptr = NULL;
+#ifdef HAVE_INCOHERENT_MMAP
+		/* Incoherent mmap means everyone must mmap! */
+		return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+				  "ntdb_mmap failed for size %lld (%s)",
+				  (long long)ntdb->file->map_size,
+				  strerror(errno));
+#else /* only warn: ntdb_read/ntdb_write use pread/pwrite if map_ptr is NULL */
+		ntdb_logerr(ntdb, NTDB_SUCCESS, NTDB_LOG_WARNING,
+			   "ntdb_mmap failed for size %lld (%s)",
+			   (long long)ntdb->file->map_size, strerror(errno));
+#endif
+	}
+	return NTDB_SUCCESS;
+}
+
+/* Check that off+len lies within the current map_size.  If it does not,
+   fstat the file under the expansion read lock in case someone else has
+   expanded the database, and remap to the new size if the range now fits.
+   note that "len" is the minimum length needed for the db.
+
+   If probe is true, len being too large isn't a failure.
+*/
+static enum NTDB_ERROR ntdb_oob(struct ntdb_context *ntdb,
+			      ntdb_off_t off, ntdb_len_t len, bool probe)
+{
+	struct stat st;
+	enum NTDB_ERROR ecode;
+
+	/* We can't hold pointers during this: we could unmap! */
+	assert(!ntdb->direct_access
+	       || (ntdb->flags & NTDB_NOLOCK)
+	       || ntdb_has_expansion_lock(ntdb));
+
+	if (len + off < len) {
+		if (probe)
+			return NTDB_SUCCESS;
+		return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+				  "ntdb_oob off %llu len %llu wrap",
+				  (unsigned long long)off,
+				  (unsigned long long)len);
+	}
+
+	if (len + off <= ntdb->file->map_size)
+		return NTDB_SUCCESS;
+	if (ntdb->flags & NTDB_INTERNAL) {
+		if (probe)
+			return NTDB_SUCCESS;
+
+		ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+			   "ntdb_oob len %lld beyond internal"
+			   " malloc size %lld",
+			   (long long)(off + len),
+			   (long long)ntdb->file->map_size);
+		return NTDB_ERR_IO;
+	}
+
+	ecode = ntdb_lock_expand(ntdb, F_RDLCK);
+	if (ecode != NTDB_SUCCESS) {
+		return ecode;
+	}
+
+	if (fstat(ntdb->file->fd, &st) != 0) {
+		ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+			   "Failed to fstat file: %s", strerror(errno));
+		ntdb_unlock_expand(ntdb, F_RDLCK);
+		return NTDB_ERR_IO;
+	}
+
+	ntdb_unlock_expand(ntdb, F_RDLCK);
+
+	if (st.st_size < off + len) {
+		if (probe)
+			return NTDB_SUCCESS;
+
+		ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+			   "ntdb_oob len %llu beyond eof at %lld",
+			   (unsigned long long)(off + len), (long long)st.st_size);
+		return NTDB_ERR_IO;
+	}
+
+	/* Unmap, update size, remap */
+	ntdb_munmap(ntdb->file);
+
+	ntdb->file->map_size = st.st_size;
+	return ntdb_mmap(ntdb);
+}
+
+/* Endian conversion, in place and only if NTDB_CONVERT: 8 byte units only */
+void *ntdb_convert(const struct ntdb_context *ntdb, void *buf, ntdb_len_t size)
+{
+	assert(size % 8 == 0);
+	if (unlikely((ntdb->flags & NTDB_CONVERT)) && buf) {
+		uint64_t i, *p = (uint64_t *)buf;
+		for (i = 0; i < size / 8; i++)
+			p[i] = bswap_64(p[i]);
+	}
+	return buf;	/* always the caller's buf, swapped or not */
+}
+
+/* Return index in [start,end) of first non-zero offset, or end, or -ve error. */
+/* FIXME: Return the off? */
+uint64_t ntdb_find_nonzero_off(struct ntdb_context *ntdb,
+			      ntdb_off_t base, uint64_t start, uint64_t end)
+{
+	uint64_t i;
+	const uint64_t *val;
+
+	/* Zero vs non-zero is the same unconverted: minor optimization. */
+	val = ntdb_access_read(ntdb, base + start * sizeof(ntdb_off_t),
+			      (end - start) * sizeof(ntdb_off_t), false);
+	if (NTDB_PTR_IS_ERR(val)) {
+		return NTDB_ERR_TO_OFF(NTDB_PTR_ERR(val));
+	}
+	/* val[0] corresponds to index start; i counts the leading zeros. */
+	for (i = 0; i < (end - start); i++) {
+		if (val[i])
+			break;
+	}
+	ntdb_access_release(ntdb, val);
+	return start + i;
+}
+
+/* Return first zero offset in num offset array, or num, or -ve error. */
+uint64_t ntdb_find_zero_off(struct ntdb_context *ntdb, ntdb_off_t off,
+			   uint64_t num)
+{
+	uint64_t i;
+	const uint64_t *val;
+
+	/* Zero vs non-zero is the same unconverted: minor optimization. */
+	val = ntdb_access_read(ntdb, off, num * sizeof(ntdb_off_t), false);
+	if (NTDB_PTR_IS_ERR(val)) {
+		return NTDB_ERR_TO_OFF(NTDB_PTR_ERR(val));
+	}
+	/* i counts the leading non-zero entries of the array at off. */
+	for (i = 0; i < num; i++) {
+		if (!val[i])
+			break;
+	}
+	ntdb_access_release(ntdb, val);
+	return i;
+}
+
+enum NTDB_ERROR zero_out(struct ntdb_context *ntdb, ntdb_off_t off, ntdb_len_t len)
+{
+	char buf[8192] = { 0 };
+	void *p = ntdb->io->direct(ntdb, off, len, true);
+	enum NTDB_ERROR ecode = NTDB_SUCCESS;
+	/* Zero len bytes at off: memset if io->direct gave a pointer. */
+	assert(!(ntdb->flags & NTDB_RDONLY));
+	if (NTDB_PTR_IS_ERR(p)) {
+		return NTDB_PTR_ERR(p);
+	}
+	if (p) {
+		memset(p, 0, len);
+		return ecode;
+	}
+	while (len) {	/* no pointer: twrite zeros, at most sizeof(buf) a time */
+		unsigned todo = len < sizeof(buf) ? len : sizeof(buf);
+		ecode = ntdb->io->twrite(ntdb, off, buf, todo);
+		if (ecode != NTDB_SUCCESS) {
+			break;
+		}
+		len -= todo;
+		off += todo;
+	}
+	return ecode;
+}
+
+ntdb_off_t ntdb_read_off(struct ntdb_context *ntdb, ntdb_off_t off)
+{
+	ntdb_off_t ret;
+	enum NTDB_ERROR ecode;
+	/* No conversion needed: read through io->direct's pointer if any. */
+	if (likely(!(ntdb->flags & NTDB_CONVERT))) {
+		ntdb_off_t *p = ntdb->io->direct(ntdb, off, sizeof(*p), false);
+		if (NTDB_PTR_IS_ERR(p)) {
+			return NTDB_ERR_TO_OFF(NTDB_PTR_ERR(p));
+		}
+		if (p)
+			return *p;
+	}
+	/* Otherwise copy the value out; errors come back as NTDB_ERR_TO_OFF. */
+	ecode = ntdb_read_convert(ntdb, off, &ret, sizeof(ret));
+	if (ecode != NTDB_SUCCESS) {
+		return NTDB_ERR_TO_OFF(ecode);
+	}
+	return ret;
+}
+
+/* write a lump of data at a specified offset */
+static enum NTDB_ERROR ntdb_write(struct ntdb_context *ntdb, ntdb_off_t off,
+				const void *buf, ntdb_len_t len)
+{
+	enum NTDB_ERROR ecode;
+
+	if (ntdb->flags & NTDB_RDONLY) {
+		return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
+				  "Write to read-only database");
+	}
+	/* Bounds check with probe == false: out of range is an error. */
+	ecode = ntdb->io->oob(ntdb, off, len, false);
+	if (ecode != NTDB_SUCCESS) {
+		return ecode;
+	}
+	/* Copy into the mapping if there is one, else pwrite() to the fd. */
+	if (ntdb->file->map_ptr) {
+		memcpy(off + (char *)ntdb->file->map_ptr, buf, len);
+	} else {
+#ifdef HAVE_INCOHERENT_MMAP
+		return NTDB_ERR_IO;
+#else
+		ssize_t ret;
+		ret = pwrite(ntdb->file->fd, buf, len, off);
+		if (ret != len) {
+			/* This shouldn't happen: we avoid sparse files. */
+			if (ret >= 0)
+				errno = ENOSPC;
+
+			return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+					  "ntdb_write: %zi at %zu len=%zu (%s)",
+					  ret, (size_t)off, (size_t)len,
+					  strerror(errno));
+		}
+#endif
+	}
+	return NTDB_SUCCESS;
+}
+
+/* read a lump of data at a specified offset */
+static enum NTDB_ERROR ntdb_read(struct ntdb_context *ntdb, ntdb_off_t off,
+			       void *buf, ntdb_len_t len)
+{
+	enum NTDB_ERROR ecode;
+	/* Bounds check with probe == false: out of range is an error. */
+	ecode = ntdb->io->oob(ntdb, off, len, false);
+	if (ecode != NTDB_SUCCESS) {
+		return ecode;
+	}
+	/* Copy from the mapping if there is one, else pread() from the fd. */
+	if (ntdb->file->map_ptr) {
+		memcpy(buf, off + (char *)ntdb->file->map_ptr, len);
+	} else {
+#ifdef HAVE_INCOHERENT_MMAP
+		return NTDB_ERR_IO;
+#else
+		ssize_t r = pread(ntdb->file->fd, buf, len, off);
+		if (r != len) {
+			return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+					  "ntdb_read failed with %zi at %zu "
+					  "len=%zu (%s) map_size=%zu",
+					  r, (size_t)off, (size_t)len,
+					  strerror(errno),
+					  (size_t)ntdb->file->map_size);
+		}
+#endif
+	}
+	return NTDB_SUCCESS;
+}
+
+enum NTDB_ERROR ntdb_write_convert(struct ntdb_context *ntdb, ntdb_off_t off,
+				 const void *rec, size_t len)
+{
+	enum NTDB_ERROR ecode;
+	/* NTDB_CONVERT: byte-swap a heap copy so rec itself stays untouched. */
+	if (unlikely((ntdb->flags & NTDB_CONVERT))) {
+		void *conv = malloc(len);
+		if (!conv) {
+			return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
+					  "ntdb_write: no memory converting"
+					  " %zu bytes", len);
+		}
+		memcpy(conv, rec, len);
+		ecode = ntdb->io->twrite(ntdb, off,
+					ntdb_convert(ntdb, conv, len), len);
+		free(conv);
+	} else {
+		ecode = ntdb->io->twrite(ntdb, off, rec, len);
+	}
+	return ecode;
+}
+
+enum NTDB_ERROR ntdb_read_convert(struct ntdb_context *ntdb, ntdb_off_t off,
+				void *rec, size_t len)
+{	/* NB: rec is handed to ntdb_convert() even when tread failed. */
+	enum NTDB_ERROR ecode = ntdb->io->tread(ntdb, off, rec, len);
+	ntdb_convert(ntdb, rec, len);
+	return ecode;
+}
+
+enum NTDB_ERROR ntdb_write_off(struct ntdb_context *ntdb,
+			     ntdb_off_t off, ntdb_off_t val)
+{
+	if (ntdb->flags & NTDB_RDONLY) {
+		return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
+				  "Write to read-only database");
+	}
+	/* No conversion needed: store through io->direct's pointer if any. */
+	if (likely(!(ntdb->flags & NTDB_CONVERT))) {
+		ntdb_off_t *p = ntdb->io->direct(ntdb, off, sizeof(*p), true);
+		if (NTDB_PTR_IS_ERR(p)) {
+			return NTDB_PTR_ERR(p);
+		}
+		if (p) {
+			*p = val;
+			return NTDB_SUCCESS;
+		}
+	}
+	return ntdb_write_convert(ntdb, off, &val, sizeof(val));
+}
+
+static void *_ntdb_alloc_read(struct ntdb_context *ntdb, ntdb_off_t offset,
+			     ntdb_len_t len, unsigned int prefix)
+{
+	unsigned char *buf;
+	enum NTDB_ERROR ecode;
+	/* malloc prefix+len bytes; tread len bytes in after the prefix. */
+	/* some systems don't like zero length malloc */
+	buf = malloc(prefix + len ? prefix + len : 1);
+	if (!buf) {
+		ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_USE_ERROR,
+			   "ntdb_alloc_read malloc failed len=%zu",
+			   (size_t)(prefix + len));
+		return NTDB_ERR_PTR(NTDB_ERR_OOM);
+	} else {
+		ecode = ntdb->io->tread(ntdb, offset, buf+prefix, len);
+		if (unlikely(ecode != NTDB_SUCCESS)) {
+			free(buf);
+			return NTDB_ERR_PTR(ecode);
+		}
+	}
+	return buf;	/* the first prefix bytes are left uninitialized */
+}
+
+/* read a lump of data into a malloc()ed buffer; NTDB_ERR_PTR on failure */
+void *ntdb_alloc_read(struct ntdb_context *ntdb, ntdb_off_t offset, ntdb_len_t len)
+{
+	return _ntdb_alloc_read(ntdb, offset, len, 0);
+}
+
+static enum NTDB_ERROR fill(struct ntdb_context *ntdb,
+			   const void *buf, size_t size,
+			   ntdb_off_t off, ntdb_len_t len)
+{	/* pwrite() the size-byte pattern in buf repeatedly over len bytes at off */
+	while (len) {
+		size_t n = len > size ? size : len;
+		ssize_t ret = pwrite(ntdb->file->fd, buf, n, off);
+		if (ret != n) {
+			if (ret >= 0)	/* short write: report it as ENOSPC */
+				errno = ENOSPC;
+
+			return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+					  "fill failed:"
+					  " %zi at %zu len=%zu (%s)",
+					  ret, (size_t)off, (size_t)len,
+					  strerror(errno));
+		}
+		len -= n;
+		off += n;
+	}
+	return NTDB_SUCCESS;
+}
+
+/* expand a file by addition bytes.  we prefer to use ftruncate, as that is
+  what posix says to use for mmap expansion */
+static enum NTDB_ERROR ntdb_expand_file(struct ntdb_context *ntdb,
+				      ntdb_len_t addition)
+{
+	char buf[8192];
+	enum NTDB_ERROR ecode;
+
+	if (ntdb->flags & NTDB_RDONLY) {
+		return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
+				  "Expand on read-only database");
+	}
+	/* Internal db: map_ptr is heap memory here, so just realloc() it. */
+	if (ntdb->flags & NTDB_INTERNAL) {
+		char *new = realloc(ntdb->file->map_ptr,
+				    ntdb->file->map_size + addition);
+		if (!new) {
+			return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
+					  "No memory to expand database");
+		}
+		ntdb->file->map_ptr = new;
+		ntdb->file->map_size += addition;
+		return NTDB_SUCCESS;
+	} else {
+		/* Unmap before trying to write; the old TDB claimed OpenBSD
+		 * had problems with this otherwise. */
+		ntdb_munmap(ntdb->file);
+
+		/* If this fails, we try to fill anyway. */
+		if (ftruncate(ntdb->file->fd, ntdb->file->map_size + addition))
+			;
+
+		/* now fill the file with something. This ensures that the
+		   file isn't sparse, which would be very bad if we ran out of
+		   disk. This must be done with write, not via mmap */
+		memset(buf, 0x43, sizeof(buf));
+		ecode = fill(ntdb, buf, sizeof(buf), ntdb->file->map_size,
+			     addition);
+		if (ecode != NTDB_SUCCESS)
+			return ecode;
+		ntdb->file->map_size += addition;
+		return ntdb_mmap(ntdb);
+	}
+}
+
+const void *ntdb_access_read(struct ntdb_context *ntdb,
+			    ntdb_off_t off, ntdb_len_t len, bool convert)
+{
+	void *ret = NULL;
+	/* Prefer a direct pointer; never when NTDB_CONVERT is set. */
+	if (likely(!(ntdb->flags & NTDB_CONVERT))) {
+		ret = ntdb->io->direct(ntdb, off, len, false);
+
+		if (NTDB_PTR_IS_ERR(ret)) {
+			return ret;
+		}
+	}
+	if (!ret) {	/* fall back to a heap copy tracked on ntdb->access */
+		struct ntdb_access_hdr *hdr;
+		hdr = _ntdb_alloc_read(ntdb, off, len, sizeof(*hdr));
+		if (NTDB_PTR_IS_ERR(hdr)) {
+			return hdr;
+		}
+		hdr->next = ntdb->access;
+		ntdb->access = hdr;
+		ret = hdr + 1;
+		if (convert) {
+			ntdb_convert(ntdb, (void *)ret, len);
+		}
+	} else
+		ntdb->direct_access++;
+	/* Either way the caller pairs this with ntdb_access_release(). */
+	return ret;
+}
+
+void *ntdb_access_write(struct ntdb_context *ntdb,
+		       ntdb_off_t off, ntdb_len_t len, bool convert)
+{
+	void *ret = NULL;
+
+	if (ntdb->flags & NTDB_RDONLY) {
+		ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
+			   "Write to read-only database");
+		return NTDB_ERR_PTR(NTDB_ERR_RDONLY);
+	}
+	/* Prefer a direct writable pointer; never when NTDB_CONVERT is set. */
+	if (likely(!(ntdb->flags & NTDB_CONVERT))) {
+		ret = ntdb->io->direct(ntdb, off, len, true);
+
+		if (NTDB_PTR_IS_ERR(ret)) {
+			return ret;
+		}
+	}
+	/* Else hand out a heap copy; hdr records where to write it back. */
+	if (!ret) {
+		struct ntdb_access_hdr *hdr;
+		hdr = _ntdb_alloc_read(ntdb, off, len, sizeof(*hdr));
+		if (NTDB_PTR_IS_ERR(hdr)) {
+			return hdr;
+		}
+		hdr->next = ntdb->access;
+		ntdb->access = hdr;
+		hdr->off = off;
+		hdr->len = len;
+		hdr->convert = convert;
+		ret = hdr + 1;
+		if (convert)
+			ntdb_convert(ntdb, (void *)ret, len);
+	} else
+		ntdb->direct_access++;
+	/* ntdb_access_commit() writes a heap copy back using hdr->off/len. */
+	return ret;
+}
+
+static struct ntdb_access_hdr **find_hdr(struct ntdb_context *ntdb, const void *p)
+{
+	struct ntdb_access_hdr **hp;
+	/* Find the list link whose header sits just before p, for unlinking. */
+	for (hp = &ntdb->access; *hp; hp = &(*hp)->next) {
+		if (*hp + 1 == p)
+			return hp;
+	}
+	return NULL;	/* p is not one of the heap copies */
+}
+
+void ntdb_access_release(struct ntdb_context *ntdb, const void *p)
+{
+	struct ntdb_access_hdr *hdr, **hp = find_hdr(ntdb, p);
+	/* Heap copy: unlink and free.  Otherwise p was a direct pointer. */
+	if (hp) {
+		hdr = *hp;
+		*hp = hdr->next;
+		free(hdr);
+	} else
+		ntdb->direct_access--;
+}
+
+enum NTDB_ERROR ntdb_access_commit(struct ntdb_context *ntdb, void *p)
+{
+	struct ntdb_access_hdr *hdr, **hp = find_hdr(ntdb, p);
+	enum NTDB_ERROR ecode;
+	/* NOTE(review): else-branch calls ntdb_write(), not io->twrite - intended? */
+	if (hp) {
+		hdr = *hp;
+		if (hdr->convert)
+			ecode = ntdb_write_convert(ntdb, hdr->off, p, hdr->len);
+		else
+			ecode = ntdb_write(ntdb, hdr->off, p, hdr->len);
+		*hp = hdr->next;
+		free(hdr);
+	} else {
+		ntdb->direct_access--;
+		ecode = NTDB_SUCCESS;
+	}
+	/* A heap copy is freed even if writing it back failed. */
+	return ecode;
+}
+
+static void *ntdb_direct(struct ntdb_context *ntdb, ntdb_off_t off, size_t len,
+			bool write_mode)
+{
+	enum NTDB_ERROR ecode;
+	/* NULL means "no direct access"; write_mode is not consulted here. */
+	if (unlikely(!ntdb->file->map_ptr))
+		return NULL;
+	/* Bounds check (not a probe); failure becomes an NTDB_ERR_PTR. */
+	ecode = ntdb_oob(ntdb, off, len, false);
+	if (unlikely(ecode != NTDB_SUCCESS))
+		return NTDB_ERR_PTR(ecode);
+	return (char *)ntdb->file->map_ptr + off;
+}
+
+void ntdb_inc_seqnum(struct ntdb_context *ntdb)
+{
+	ntdb_off_t seq;
+	/* Bump the header's seqnum, keeping it non-negative as an int64_t. */
+	if (likely(!(ntdb->flags & NTDB_CONVERT))) {
+		int64_t *direct;
+
+		direct = ntdb->io->direct(ntdb,
+					 offsetof(struct ntdb_header, seqnum),
+					 sizeof(*direct), true);
+		if (likely(direct && !NTDB_PTR_IS_ERR(direct))) {
+			/* Don't let it go negative, even briefly */
+			if (unlikely((int64_t)((uint64_t)*direct + 1) < 0))
+				*direct = 0;
+			(*direct)++;
+			return;
+		}
+	}
+	/* No usable direct pointer: read-modify-write through the io layer. */
+	seq = ntdb_read_off(ntdb, offsetof(struct ntdb_header, seqnum));
+	if (!NTDB_OFF_IS_ERR(seq)) {
+		seq++;
+		if (unlikely((int64_t)seq < 0))
+			seq = 0;
+		ntdb_write_off(ntdb, offsetof(struct ntdb_header, seqnum), seq);
+	}
+}
+
+static const struct ntdb_methods io_methods = { /* positional initializers: */
+	ntdb_read,		/* presumably ->tread - confirm in private.h */
+	ntdb_write,		/* presumably ->twrite */
+	ntdb_oob,		/* presumably ->oob */
+	ntdb_expand_file,
+	ntdb_direct,		/* presumably ->direct */
+};
+
+/*
+  initialise the default methods table: point ntdb->io at io_methods
+*/
+void ntdb_io_init(struct ntdb_context *ntdb)
+{
+	ntdb->io = &io_methods;
+}
diff --git a/lib/ntdb/lock.c b/lib/ntdb/lock.c
new file mode 100644
index 0000000000..167770d097
--- /dev/null
+++ b/lib/ntdb/lock.c
@@ -0,0 +1,883 @@
+ /*
+   Unix SMB/CIFS implementation.
+
+   trivial database library
+
+   Copyright (C) Andrew Tridgell              1999-2005
+   Copyright (C) Paul `Rusty' Russell		   2000
+   Copyright (C) Jeremy Allison			   2000-2003
+
+     ** NOTE! The following LGPL license applies to the ntdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "private.h"
+#include <assert.h>
+#include <ccan/build_assert/build_assert.h>
+
+/* If we were threaded, we could wait for unlock, but we're not, so fail:
+ * logs the conflict against call and returns ntdb_logerr()'s result. */
+enum NTDB_ERROR owner_conflict(struct ntdb_context *ntdb, const char *call)
+{
+	return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
+			  "%s: lock owned by another ntdb in this process.",
+			  call);
+}
+
+/* If we fork, we no longer really own locks: false if locker != getpid(). */
+bool check_lock_pid(struct ntdb_context *ntdb, const char *call, bool log)
+{
+	/* No locks?  No problem! */
+	if (ntdb->file->allrecord_lock.count == 0
+	    && ntdb->file->num_lockrecs == 0) {
+		return true;
+	}
+
+	/* No fork?  No problem! */
+	if (ntdb->file->locker == getpid()) {
+		return true;
+	}
+	/* Locks are held but locker is another pid: log if asked, then fail. */
+	if (log) {
+		ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
+			   "%s: fork() detected after lock acquisition!"
+			   " (%u vs %u)", call, ntdb->file->locker, getpid());
+	}
+	return false;
+}
+
+int ntdb_fcntl_lock(int fd, int rw, off_t off, off_t len, bool waitflag,
+		   void *unused)
+{
+	struct flock fl;
+	int ret;
+	/* fcntl() lock of type rw on [off, off+len); retried while EINTR. */
+	do {
+		fl.l_type = rw;
+		fl.l_whence = SEEK_SET;
+		fl.l_start = off;
+		fl.l_len = len;
+		/* F_SETLKW blocks until granted; F_SETLK fails immediately. */
+		if (waitflag)
+			ret = fcntl(fd, F_SETLKW, &fl);
+		else
+			ret = fcntl(fd, F_SETLK, &fl);
+	} while (ret != 0 && errno == EINTR);
+	return ret;
+}
+
+int ntdb_fcntl_unlock(int fd, int rw, off_t off, off_t len, void *unused)
+{
+	struct flock fl;
+	int ret;
+	/* fcntl() F_UNLCK on [off, off+len); rw and unused are not consulted. */
+	do {
+		fl.l_type = F_UNLCK;
+		fl.l_whence = SEEK_SET;
+		fl.l_start = off;
+		fl.l_len = len;
+
+		ret = fcntl(fd, F_SETLKW, &fl);
+	} while (ret != 0 && errno == EINTR);
+	return ret;
+}
+
+static int lock(struct ntdb_context *ntdb,
+		      int rw, off_t off, off_t len, bool waitflag)
+{
+	int ret;
+	if (ntdb->file->allrecord_lock.count == 0
+	    && ntdb->file->num_lockrecs == 0) {
+		ntdb->file->locker = getpid();
+	}
+	/* locker (set above with no locks held) is what check_lock_pid tests. */
+	ntdb->stats.lock_lowlevel++;
+	ret = ntdb->lock_fn(ntdb->file->fd, rw, off, len, waitflag,
+			   ntdb->lock_data);
+	if (!waitflag) {
+		ntdb->stats.lock_nonblock++;
+		if (ret != 0)
+			ntdb->stats.lock_nonblock_fail++;
+	}
+	return ret;
+}
+
+static int unlock(struct ntdb_context *ntdb, int rw, off_t off, off_t len)
+{
+#if 0 /* Check they matched up locks and unlocks correctly. */
+	char line[80];
+	FILE *locks;
+	bool found = false;
+
+	locks = fopen("/proc/locks", "r");
+
+	while (fgets(line, 80, locks)) {
+		char *p;
+		int type, start, l;
+
+		/* eg. 1: FLOCK  ADVISORY  WRITE 2440 08:01:2180826 0 EOF */
+		p = strchr(line, ':') + 1;
+		if (strncmp(p, " POSIX  ADVISORY  ", strlen(" POSIX  ADVISORY  ")))
+			continue;
+		p += strlen(" FLOCK  ADVISORY  ");
+		if (strncmp(p, "READ  ", strlen("READ  ")) == 0)
+			type = F_RDLCK;
+		else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0)
+			type = F_WRLCK;
+		else
+			abort();
+		p += 6;
+		if (atoi(p) != getpid())
+			continue;
+		p = strchr(strchr(p, ' ') + 1, ' ') + 1;
+		start = atoi(p);
+		p = strchr(p, ' ') + 1;
+		if (strncmp(p, "EOF", 3) == 0)
+			l = 0;
+		else
+			l = atoi(p) - start + 1;
+
+		if (off == start) {
+			if (len != l) {
+				fprintf(stderr, "Len %u should be %u: %s",
+					(int)len, l, line);
+				abort();
+			}
+			if (type != rw) {
+				fprintf(stderr, "Type %s wrong: %s",
+					rw == F_RDLCK ? "READ" : "WRITE", line);
+				abort();
+			}
+			found = true;
+			break;
+		}
+	}
+
+	if (!found) {
+		fprintf(stderr, "Unlock on %u@%u not found!",
+			(int)off, (int)len);
+		abort();
+	}
+
+	fclose(locks);
+#endif
+	/* The real work: delegate to the configured unlock_fn callback. */
+	return ntdb->unlock_fn(ntdb->file->fd, rw, off, len, ntdb->lock_data);
+}
+
+/* a byte range locking function - return NTDB_SUCCESS on success
+   this function locks len bytes at the specified offset.
+
+   note that a len of zero means lock to end of file
+*/
+static enum NTDB_ERROR ntdb_brlock(struct ntdb_context *ntdb,
+				 int rw_type, ntdb_off_t offset, ntdb_off_t len,
+				 enum ntdb_lock_flags flags)
+{
+	int ret;
+
+	if (ntdb->flags & NTDB_NOLOCK) {
+		return NTDB_SUCCESS;
+	}
+
+	if (rw_type == F_WRLCK && (ntdb->flags & NTDB_RDONLY)) {
+		return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
+				  "Write lock attempted on read-only database");
+	}
+
+	/* A 32 bit system cannot open a 64-bit file, but it could have
+	 * expanded since then: check here. */
+	if ((size_t)(offset + len) != offset + len) {
+		return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+				  "ntdb_brlock: lock on giant offset %llu",
+				  (long long)(offset + len));
+	}
+	/* Only the NTDB_LOCK_WAIT bit decides whether the lock blocks. */
+	ret = lock(ntdb, rw_type, offset, len, flags & NTDB_LOCK_WAIT);
+	if (ret != 0) {
+		/* Generic lock error. errno set by fcntl.
+		 * EAGAIN is an expected return from non-blocking
+		 * locks. */
+		if (!(flags & NTDB_LOCK_PROBE)
+		    && (errno != EAGAIN && errno != EINTR)) {
+			ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+				   "ntdb_brlock failed (fd=%d) at"
+				   " offset %zu rw_type=%d flags=%d len=%zu:"
+				   " %s",
+				   ntdb->file->fd, (size_t)offset, rw_type,
+				   flags, (size_t)len, strerror(errno));
+		}
+		return NTDB_ERR_LOCK;
+	}
+	return NTDB_SUCCESS;
+}
+
+static enum NTDB_ERROR ntdb_brunlock(struct ntdb_context *ntdb,
+				   int rw_type, ntdb_off_t offset, size_t len)
+{
+	if (ntdb->flags & NTDB_NOLOCK) {
+		return NTDB_SUCCESS;
+	}
+	/* Refuse (and log) if the locks were taken under another pid. */
+	if (!check_lock_pid(ntdb, "ntdb_brunlock", true))
+		return NTDB_ERR_LOCK;
+
+	if (unlock(ntdb, rw_type, offset, len) == -1) {
+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+				  "ntdb_brunlock failed (fd=%d) at offset %zu"
+				  " rw_type=%d len=%zu: %s",
+				  ntdb->file->fd, (size_t)offset, rw_type,
+				  (size_t)len, strerror(errno));
+	}
+	return NTDB_SUCCESS;
+}
+
+/*
+  upgrade a read lock to a write lock. This needs to be handled in a
+  special way as some OSes (such as solaris) have too conservative
+  deadlock detection and claim a deadlock when progress can be
+  made. For those OSes we may loop for a while.
+*/
+enum NTDB_ERROR ntdb_allrecord_upgrade(struct ntdb_context *ntdb, off_t start)
+{
+	int count = 1000;
+	/* NOTE(review): label differs from owner_conflict()'s below - intended? */
+	if (!check_lock_pid(ntdb, "ntdb_transaction_prepare_commit", true))
+		return NTDB_ERR_LOCK;
+	/* Exactly one (un-nested) allrecord lock must be held... */
+	if (ntdb->file->allrecord_lock.count != 1) {
+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+				  "ntdb_allrecord_upgrade failed:"
+				  " count %u too high",
+				  ntdb->file->allrecord_lock.count);
+	}
+	/* ...it must still be marked upgradable (off == 1)... */
+	if (ntdb->file->allrecord_lock.off != 1) {
+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+				  "ntdb_allrecord_upgrade failed:"
+				  " already upgraded?");
+	}
+	/* ...and it must belong to this context. */
+	if (ntdb->file->allrecord_lock.owner != ntdb) {
+		return owner_conflict(ntdb, "ntdb_allrecord_upgrade");
+	}
+	/* Retry up to count times, but only while fcntl reports EDEADLK. */
+	while (count--) {
+		struct timeval tv;
+		if (ntdb_brlock(ntdb, F_WRLCK, start, 0,
+			       NTDB_LOCK_WAIT|NTDB_LOCK_PROBE) == NTDB_SUCCESS) {
+			ntdb->file->allrecord_lock.ltype = F_WRLCK;
+			ntdb->file->allrecord_lock.off = 0;
+			return NTDB_SUCCESS;
+		}
+		if (errno != EDEADLK) {
+			break;
+		}
+		/* sleep for as short a time as we can - more portable than usleep() */
+		tv.tv_sec = 0;
+		tv.tv_usec = 1;
+		select(0, NULL, NULL, NULL, &tv);
+	}
+	/* PROBE kept ntdb_brlock quiet, so log here unless merely EAGAIN/EINTR. */
+	if (errno != EAGAIN && errno != EINTR)
+		ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+			   "ntdb_allrecord_upgrade failed");
+	return NTDB_ERR_LOCK;
+}
+
+static struct ntdb_lock *find_nestlock(struct ntdb_context *ntdb, ntdb_off_t offset,
+				      const struct ntdb_context *owner)
+{
+	unsigned int i;
+	/* owner == NULL matches any owner; a foreign owner's record gives NULL. */
+	for (i=0; i<ntdb->file->num_lockrecs; i++) {
+		if (ntdb->file->lockrecs[i].off == offset) {
+			if (owner && ntdb->file->lockrecs[i].owner != owner)
+				return NULL;
+			return &ntdb->file->lockrecs[i];
+		}
+	}
+	return NULL;
+}
+
+enum NTDB_ERROR ntdb_lock_and_recover(struct ntdb_context *ntdb)
+{
+	enum NTDB_ERROR ecode;
+	/* Run ntdb_transaction_recover() under allrecord + open write locks. */
+	if (!check_lock_pid(ntdb, "ntdb_lock_and_recover", true))
+		return NTDB_ERR_LOCK;
+	/* NOCHECK: the lock calls must not start their own recovery check. */
+	ecode = ntdb_allrecord_lock(ntdb, F_WRLCK, NTDB_LOCK_WAIT|NTDB_LOCK_NOCHECK,
+				   false);
+	if (ecode != NTDB_SUCCESS) {
+		return ecode;
+	}
+
+	ecode = ntdb_lock_open(ntdb, F_WRLCK, NTDB_LOCK_WAIT|NTDB_LOCK_NOCHECK);
+	if (ecode != NTDB_SUCCESS) {
+		ntdb_allrecord_unlock(ntdb, F_WRLCK);
+		return ecode;
+	}
+	ecode = ntdb_transaction_recover(ntdb);
+	ntdb_unlock_open(ntdb, F_WRLCK);
+	ntdb_allrecord_unlock(ntdb, F_WRLCK);
+	/* Both locks are dropped whether or not recovery succeeded. */
+	return ecode;
+}
+
+/* lock an offset in the database, nesting via a per-offset count. */
+static enum NTDB_ERROR ntdb_nest_lock(struct ntdb_context *ntdb,
+				    ntdb_off_t offset, int ltype,
+				    enum ntdb_lock_flags flags)
+{
+	struct ntdb_lock *new_lck;
+	enum NTDB_ERROR ecode;
+
+	if (offset > (NTDB_HASH_LOCK_START + NTDB_HASH_LOCK_RANGE
+		      + ntdb->file->map_size / 8)) {
+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+				  "ntdb_nest_lock: invalid offset %zu ltype=%d",
+				  (size_t)offset, ltype);
+	}
+
+	if (ntdb->flags & NTDB_NOLOCK)
+		return NTDB_SUCCESS;
+
+	if (!check_lock_pid(ntdb, "ntdb_nest_lock", true)) {
+		return NTDB_ERR_LOCK;
+	}
+
+	ntdb->stats.locks++;
+	/* Already recorded at this offset (by any owner)?  Then just nest. */
+	new_lck = find_nestlock(ntdb, offset, NULL);
+	if (new_lck) {
+		if (new_lck->owner != ntdb) {
+			return owner_conflict(ntdb, "ntdb_nest_lock");
+		}
+
+		if (new_lck->ltype == F_RDLCK && ltype == F_WRLCK) {
+			return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+					  "ntdb_nest_lock:"
+					  " offset %zu has read lock",
+					  (size_t)offset);
+		}
+		/* Just increment the struct, posix locks don't stack. */
+		new_lck->count++;
+		return NTDB_SUCCESS;
+	}
+
+#if 0
+	if (ntdb->file->num_lockrecs
+	    && offset >= NTDB_HASH_LOCK_START
+	    && offset < NTDB_HASH_LOCK_START + NTDB_HASH_LOCK_RANGE) {
+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+				  "ntdb_nest_lock: already have a hash lock?");
+	}
+#endif
+	/* Grow the record array first, so recording the lock cannot fail. */
+	new_lck = (struct ntdb_lock *)realloc(
+		ntdb->file->lockrecs,
+		sizeof(*ntdb->file->lockrecs) * (ntdb->file->num_lockrecs+1));
+	if (new_lck == NULL) {
+		return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
+				  "ntdb_nest_lock:"
+				  " unable to allocate %zu lock struct",
+				  ntdb->file->num_lockrecs + 1);
+	}
+	ntdb->file->lockrecs = new_lck;
+
+	/* Since fcntl locks don't nest, we do a lock for the first one,
+	   and simply bump the count for future ones */
+	ecode = ntdb_brlock(ntdb, ltype, offset, 1, flags);
+	if (ecode != NTDB_SUCCESS) {
+		return ecode;
+	}
+
+	/* First time we grab a lock, perhaps someone died in commit? */
+	if (!(flags & NTDB_LOCK_NOCHECK)
+	    && ntdb->file->num_lockrecs == 0) {
+		ntdb_bool_err berr = ntdb_needs_recovery(ntdb);
+		if (berr != false) {
+			ntdb_brunlock(ntdb, ltype, offset, 1);
+			/* Lock dropped: report the error, or recover and retake. */
+			if (berr < 0)
+				return NTDB_OFF_TO_ERR(berr);
+			ecode = ntdb_lock_and_recover(ntdb);
+			if (ecode == NTDB_SUCCESS) {
+				ecode = ntdb_brlock(ntdb, ltype, offset, 1,
+						   flags);
+			}
+			if (ecode != NTDB_SUCCESS) {
+				return ecode;
+			}
+		}
+	}
+	/* Record the new lock in the slot reserved by the realloc above. */
+	ntdb->file->lockrecs[ntdb->file->num_lockrecs].owner = ntdb;
+	ntdb->file->lockrecs[ntdb->file->num_lockrecs].off = offset;
+	ntdb->file->lockrecs[ntdb->file->num_lockrecs].count = 1;
+	ntdb->file->lockrecs[ntdb->file->num_lockrecs].ltype = ltype;
+	ntdb->file->num_lockrecs++;
+
+	return NTDB_SUCCESS;
+}
+
+static enum NTDB_ERROR ntdb_nest_unlock(struct ntdb_context *ntdb,
+				      ntdb_off_t off, int ltype)
+{
+	struct ntdb_lock *lck;
+	enum NTDB_ERROR ecode;
+
+	if (ntdb->flags & NTDB_NOLOCK)
+		return NTDB_SUCCESS;
+	/* Only a lock recorded at off and owned by this context will do. */
+	lck = find_nestlock(ntdb, off, ntdb);
+	if ((lck == NULL) || (lck->count == 0)) {
+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+				  "ntdb_nest_unlock: no lock for %zu",
+				  (size_t)off);
+	}
+	/* Still nested: just drop one level, the fcntl lock stays. */
+	if (lck->count > 1) {
+		lck->count--;
+		return NTDB_SUCCESS;
+	}
+
+	/*
+	 * This lock has count==1 left, so we need to unlock it in the
+	 * kernel. We don't bother with decrementing the in-memory array
+	 * element, we're about to overwrite it with the last array element
+	 * anyway.
+	 */
+	ecode = ntdb_brunlock(ntdb, ltype, off, 1);
+
+	/*
+	 * Shrink the array by overwriting the element just unlocked with the
+	 * last array element.
+	 */
+	*lck = ntdb->file->lockrecs[--ntdb->file->num_lockrecs];
+
+	return ecode;
+}
+
+/*
+  get the transaction lock: blocking nest lock on NTDB_TRANSACTION_LOCK
+ */
+enum NTDB_ERROR ntdb_transaction_lock(struct ntdb_context *ntdb, int ltype)
+{
+	return ntdb_nest_lock(ntdb, NTDB_TRANSACTION_LOCK, ltype, NTDB_LOCK_WAIT);
+}
+
+/*
+  release the transaction lock (any ntdb_nest_unlock error is discarded)
+ */
+void ntdb_transaction_unlock(struct ntdb_context *ntdb, int ltype)
+{
+	ntdb_nest_unlock(ntdb, NTDB_TRANSACTION_LOCK, ltype);
+}
+
+/* We only need to lock individual bytes, but Linux merges consecutive locks
+ * so we lock in contiguous ranges. */
+static enum NTDB_ERROR ntdb_lock_gradual(struct ntdb_context *ntdb,
+				       int ltype, enum ntdb_lock_flags flags,
+				       ntdb_off_t off, ntdb_off_t len)
+{
+	enum NTDB_ERROR ecode;
+	enum ntdb_lock_flags nb_flags = (flags & ~NTDB_LOCK_WAIT);
+
+	if (len <= 1) {
+		/* 0 would mean to end-of-file... */
+		assert(len != 0);
+		/* Single hash.  Just do blocking lock. */
+		return ntdb_brlock(ntdb, ltype, off, len, flags);
+	}
+
+	/* First we try non-blocking. */
+	ecode = ntdb_brlock(ntdb, ltype, off, len, nb_flags);
+	if (ecode != NTDB_ERR_LOCK) {
+		return ecode;
+	}
+
+	/* Try locking first half, then second. */
+	ecode = ntdb_lock_gradual(ntdb, ltype, flags, off, len / 2);
+	if (ecode != NTDB_SUCCESS)
+		return ecode;
+	/* Second half failing means the first half must be rolled back. */
+	ecode = ntdb_lock_gradual(ntdb, ltype, flags,
+				 off + len / 2, len - len / 2);
+	if (ecode != NTDB_SUCCESS) {
+		ntdb_brunlock(ntdb, ltype, off, len / 2);
+	}
+	return ecode;
+}
+
+/* Lock the entire database.  It can only be upgradable if you have some
+ * other way of guaranteeing exclusivity (ie. transaction write lock). */
+enum NTDB_ERROR ntdb_allrecord_lock(struct ntdb_context *ntdb, int ltype,
+				  enum ntdb_lock_flags flags, bool upgradable)
+{
+	enum NTDB_ERROR ecode;
+	ntdb_bool_err berr;
+
+	if (ntdb->flags & NTDB_NOLOCK)
+		return NTDB_SUCCESS;
+
+	if (!check_lock_pid(ntdb, "ntdb_allrecord_lock", true)) {
+		return NTDB_ERR_LOCK;
+	}
+
+	if (ntdb->file->allrecord_lock.count) {
+		if (ntdb->file->allrecord_lock.owner != ntdb) {
+			return owner_conflict(ntdb, "ntdb_allrecord_lock");
+		}
+
+		if (ltype == F_RDLCK
+		    || ntdb->file->allrecord_lock.ltype == F_WRLCK) {
+			ntdb->file->allrecord_lock.count++;
+			return NTDB_SUCCESS;
+		}
+
+		/* a global lock of a different type exists: no nesting here */
+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
+				  "ntdb_allrecord_lock: already have %s lock",
+				  ntdb->file->allrecord_lock.ltype == F_RDLCK
+				  ? "read" : "write");
+	}
+
+	if (ntdb_has_hash_locks(ntdb)) {
+		/* can't combine global and chain locks */
+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
+				  "ntdb_allrecord_lock:"
+				  " already have chain lock");
+	}
+
+	if (upgradable && ltype != F_RDLCK) {
+		/* ntdb error: you can't upgrade a write lock! */
+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+				  "ntdb_allrecord_lock:"
+				  " can't upgrade a write lock");
+	}
+
+	ntdb->stats.locks++;
+again:
+	/* Lock the hash-lock range, gradually. */
+	ecode = ntdb_lock_gradual(ntdb, ltype, flags, NTDB_HASH_LOCK_START,
+				 NTDB_HASH_LOCK_RANGE);
+	if (ecode != NTDB_SUCCESS)
+		return ecode;
+
+	/* Lock free tables: from there to end of file (length 0). */
+	ecode = ntdb_brlock(ntdb, ltype,
+			   NTDB_HASH_LOCK_START + NTDB_HASH_LOCK_RANGE,
+			   0, flags);
+	if (ecode != NTDB_SUCCESS) {
+		ntdb_brunlock(ntdb, ltype, NTDB_HASH_LOCK_START,
+			     NTDB_HASH_LOCK_RANGE);
+		return ecode;
+	}
+
+	ntdb->file->allrecord_lock.owner = ntdb;
+	ntdb->file->allrecord_lock.count = 1;
+	/* If it's upgradable, it's actually exclusive so we can treat
+	 * it as a write lock; .off records the upgradable flag. */
+	ntdb->file->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype;
+	ntdb->file->allrecord_lock.off = upgradable;
+
+	/* Now check for needing recovery, unless NTDB_LOCK_NOCHECK is set. */
+	if (flags & NTDB_LOCK_NOCHECK)
+		return NTDB_SUCCESS;
+
+	berr = ntdb_needs_recovery(ntdb);
+	if (likely(berr == false))
+		return NTDB_SUCCESS;
+
+	ntdb_allrecord_unlock(ntdb, ltype);
+	if (berr < 0)
+		return NTDB_OFF_TO_ERR(berr);
+	ecode = ntdb_lock_and_recover(ntdb);
+	if (ecode != NTDB_SUCCESS) {
+		return ecode;
+	}
+	goto again;	/* recovery succeeded: take the locks afresh */
+}
+
+enum NTDB_ERROR ntdb_lock_open(struct ntdb_context *ntdb,
+			     int ltype, enum ntdb_lock_flags flags)
+{	/* The open lock is a nested lock at NTDB_OPEN_LOCK. */
+	return ntdb_nest_lock(ntdb, NTDB_OPEN_LOCK, ltype, flags);
+}
+
+void ntdb_unlock_open(struct ntdb_context *ntdb, int ltype)
+{	/* Drop the nested lock at NTDB_OPEN_LOCK; the result is discarded. */
+	ntdb_nest_unlock(ntdb, NTDB_OPEN_LOCK, ltype);
+}
+
+bool ntdb_has_open_lock(struct ntdb_context *ntdb)
+{	/* Always false under NTDB_NOLOCK; otherwise look the lock up. */
+	return !(ntdb->flags & NTDB_NOLOCK)
+		&& find_nestlock(ntdb, NTDB_OPEN_LOCK, ntdb) != NULL;
+}
+
+enum NTDB_ERROR ntdb_lock_expand(struct ntdb_context *ntdb, int ltype)
+{
+	/* Lock doesn't protect data, so pass NOCHECK (we recurse if we check!) */
+	return ntdb_nest_lock(ntdb, NTDB_EXPANSION_LOCK, ltype,
+			     NTDB_LOCK_WAIT | NTDB_LOCK_NOCHECK);
+}
+
+void ntdb_unlock_expand(struct ntdb_context *ntdb, int ltype)
+{	/* Drop the nested lock at NTDB_EXPANSION_LOCK; result is discarded. */
+	ntdb_nest_unlock(ntdb, NTDB_EXPANSION_LOCK, ltype);
+}
+
+/* Unlock entire db: nested holds are counted, only the last one unlocks. */
+void ntdb_allrecord_unlock(struct ntdb_context *ntdb, int ltype)
+{
+	if (ntdb->flags & NTDB_NOLOCK)
+		return;
+
+	if (ntdb->file->allrecord_lock.count == 0) {
+		ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
+			   "ntdb_allrecord_unlock: not locked!");
+		return;
+	}
+
+	if (ntdb->file->allrecord_lock.owner != ntdb) {
+		ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
+			   "ntdb_allrecord_unlock: not locked by us!");
+		return;
+	}
+
+	/* Upgradable locks are marked as write locks, so accept F_RDLCK too. */
+	if (ntdb->file->allrecord_lock.ltype != ltype
+	    && (!ntdb->file->allrecord_lock.off || ltype != F_RDLCK)) {
+		ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+			   "ntdb_allrecord_unlock: have %s lock",
+			   ntdb->file->allrecord_lock.ltype == F_RDLCK
+			   ? "read" : "write");
+		return;
+	}
+
+	if (ntdb->file->allrecord_lock.count > 1) {
+		ntdb->file->allrecord_lock.count--;
+		return;
+	}
+
+	ntdb->file->allrecord_lock.count = 0;
+	ntdb->file->allrecord_lock.ltype = 0;
+
+	ntdb_brunlock(ntdb, ltype, NTDB_HASH_LOCK_START, 0); /* 0: to EOF */
+}
+
+bool ntdb_has_expansion_lock(struct ntdb_context *ntdb)
+{	/* True when this context has a nested lock at NTDB_EXPANSION_LOCK. */
+	return find_nestlock(ntdb, NTDB_EXPANSION_LOCK, ntdb) != NULL;
+}
+
+bool ntdb_has_hash_locks(struct ntdb_context *ntdb)
+{
+	unsigned int idx;	/* walks the table of held nested locks */
+	for (idx = 0; idx < ntdb->file->num_lockrecs; idx++) {
+		if (ntdb->file->lockrecs[idx].off < NTDB_HASH_LOCK_START)
+			continue;	/* below the hash-lock range */
+		if (ntdb->file->lockrecs[idx].off
+		    < NTDB_HASH_LOCK_START + NTDB_HASH_LOCK_RANGE)
+			return true;
+	}
+	return false;
+}
+
+static bool ntdb_has_free_lock(struct ntdb_context *ntdb)
+{
+	unsigned int idx = 0;
+	/* Report any held lock whose offset is past the hash-lock range. */
+	if (ntdb->flags & NTDB_NOLOCK)
+		return false;
+	while (idx < ntdb->file->num_lockrecs) {
+		if (ntdb->file->lockrecs[idx].off
+		    > NTDB_HASH_LOCK_START + NTDB_HASH_LOCK_RANGE)
+			return true;
+		idx++;
+	}
+	return false;
+}
+
+enum NTDB_ERROR ntdb_lock_hashes(struct ntdb_context *ntdb,
+			       ntdb_off_t hash_lock,
+			       ntdb_len_t hash_range,
+			       int ltype, enum ntdb_lock_flags waitflag)
+{
+	/* FIXME: Do this properly, using hlock_range (hash_range is unused) */
+	unsigned l = NTDB_HASH_LOCK_START
+		+ (hash_lock >> (64 - NTDB_HASH_LOCK_RANGE_BITS));
+
+	/* An allrecord lock allows us to avoid per chain locks. */
+	if (ntdb->file->allrecord_lock.count) {
+		if (!check_lock_pid(ntdb, "ntdb_lock_hashes", true))
+			return NTDB_ERR_LOCK;
+
+		if (ntdb->file->allrecord_lock.owner != ntdb)
+			return owner_conflict(ntdb, "ntdb_lock_hashes");
+		if (ltype == ntdb->file->allrecord_lock.ltype
+		    || ltype == F_RDLCK) {
+			return NTDB_SUCCESS;	/* nothing more to take */
+		}
+
+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
+				  "ntdb_lock_hashes:"
+				  " already have %s allrecordlock",
+				  ntdb->file->allrecord_lock.ltype == F_RDLCK
+				  ? "read" : "write");
+	}
+
+	if (ntdb_has_free_lock(ntdb)) {	/* refuse while a free lock is held */
+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+				  "ntdb_lock_hashes: already have free lock");
+	}
+
+	if (ntdb_has_expansion_lock(ntdb)) {	/* likewise for expansion */
+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+				  "ntdb_lock_hashes:"
+				  " already have expansion lock");
+	}
+
+	return ntdb_nest_lock(ntdb, l, ltype, waitflag);
+}
+
+enum NTDB_ERROR ntdb_unlock_hashes(struct ntdb_context *ntdb,
+				 ntdb_off_t hash_lock,
+				 ntdb_len_t hash_range, int ltype)
+{
+	unsigned l = NTDB_HASH_LOCK_START
+		+ (hash_lock >> (64 - NTDB_HASH_LOCK_RANGE_BITS));
+
+	if (ntdb->flags & NTDB_NOLOCK)
+		return 0;
+
+	/* Under an allrecord lock there is no per chain lock to release. */
+	if (ntdb->file->allrecord_lock.count) {
+		if (ntdb->file->allrecord_lock.ltype == F_RDLCK
+		    && ltype == F_WRLCK) {
+			return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+					  "ntdb_unlock_hashes RO allrecord!");
+		}
+		if (ntdb->file->allrecord_lock.owner != ntdb) {
+			return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
+					  "ntdb_unlock_hashes:"
+					  " not locked by us!");
+		}
+		return NTDB_SUCCESS;
+	}
+
+	return ntdb_nest_unlock(ntdb, l, ltype);
+}
+
+/* Hash locks use NTDB_HASH_LOCK_START + the next 30 bits.
+ * Then we begin; bucket offsets are divided by sizeof(ntdb_off_t).
+ * The result is that on 32 bit systems we don't use lock values > 2^31 on
+ * files that are less than 4GB.
+ */
+static ntdb_off_t free_lock_off(ntdb_off_t b_off)
+{
+	return NTDB_HASH_LOCK_START + NTDB_HASH_LOCK_RANGE
+		+ b_off / sizeof(ntdb_off_t);
+}
+
+enum NTDB_ERROR ntdb_lock_free_bucket(struct ntdb_context *ntdb, ntdb_off_t b_off,
+				    enum ntdb_lock_flags waitflag)
+{
+	assert(b_off >= sizeof(struct ntdb_header));
+
+	if (ntdb->flags & NTDB_NOLOCK)
+		return 0;
+
+	/* An allrecord write lock means no separate bucket lock is taken. */
+	if (ntdb->file->allrecord_lock.count) {
+		if (!check_lock_pid(ntdb, "ntdb_lock_free_bucket", true))
+			return NTDB_ERR_LOCK;
+
+		if (ntdb->file->allrecord_lock.owner != ntdb) {
+			return owner_conflict(ntdb, "ntdb_lock_free_bucket");
+		}
+
+		if (ntdb->file->allrecord_lock.ltype == F_WRLCK)
+			return 0;
+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+				  "ntdb_lock_free_bucket with"
+				  " read-only allrecordlock!");
+	}
+
+#if 0 /* FIXME */
+	if (ntdb_has_expansion_lock(ntdb)) {
+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+				  "ntdb_lock_free_bucket:"
+				  " already have expansion lock");
+	}
+#endif
+
+	return ntdb_nest_lock(ntdb, free_lock_off(b_off), F_WRLCK, waitflag);
+}
+
+void ntdb_unlock_free_bucket(struct ntdb_context *ntdb, ntdb_off_t b_off)
+{
+	if (ntdb->file->allrecord_lock.count)
+		return;	/* no bucket lock is taken under an allrecord lock */
+
+	ntdb_nest_unlock(ntdb, free_lock_off(b_off), F_WRLCK);
+}
+
+_PUBLIC_ enum NTDB_ERROR ntdb_lockall(struct ntdb_context *ntdb)
+{	/* F_WRLCK allrecord lock with NTDB_LOCK_WAIT, not upgradable. */
+	return ntdb_allrecord_lock(ntdb, F_WRLCK, NTDB_LOCK_WAIT, false);
+}
+
+_PUBLIC_ void ntdb_unlockall(struct ntdb_context *ntdb)
+{	/* Counterpart of ntdb_lockall: release an F_WRLCK allrecord lock. */
+	ntdb_allrecord_unlock(ntdb, F_WRLCK);
+}
+
+_PUBLIC_ enum NTDB_ERROR ntdb_lockall_read(struct ntdb_context *ntdb)
+{	/* F_RDLCK allrecord lock with NTDB_LOCK_WAIT, not upgradable. */
+	return ntdb_allrecord_lock(ntdb, F_RDLCK, NTDB_LOCK_WAIT, false);
+}
+
+_PUBLIC_ void ntdb_unlockall_read(struct ntdb_context *ntdb)
+{	/* Counterpart of ntdb_lockall_read: release an F_RDLCK lock. */
+	ntdb_allrecord_unlock(ntdb, F_RDLCK);
+}
+
+void ntdb_lock_cleanup(struct ntdb_context *ntdb)
+{
+	unsigned int i;
+
+	/* We don't want to warn: they're allowed to close ntdb after fork. */
+	if (!check_lock_pid(ntdb, "ntdb_close", false))
+		return;
+
+	while (ntdb->file->allrecord_lock.count	/* drop every nesting level */
+	       && ntdb->file->allrecord_lock.owner == ntdb) {
+		ntdb_allrecord_unlock(ntdb, ntdb->file->allrecord_lock.ltype);
+	}
+
+	for (i=0; i<ntdb->file->num_lockrecs; i++) {
+		if (ntdb->file->lockrecs[i].owner == ntdb) {
+			ntdb_nest_unlock(ntdb,
+					ntdb->file->lockrecs[i].off,
+					ntdb->file->lockrecs[i].ltype);
+			i--;	/* revisit slot i (presumably refilled by unlock) */
+		}
+	}
+}
diff --git a/lib/ntdb/ntdb.c b/lib/ntdb/ntdb.c
new file mode 100644
index 0000000000..9f1e32793a
--- /dev/null
+++ b/lib/ntdb/ntdb.c
@@ -0,0 +1,605 @@
+ /*
+   Trivial Database 2: fetch, store and misc routines.
+   Copyright (C) Rusty Russell 2010
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#ifndef HAVE_LIBREPLACE
+#include <ccan/asprintf/asprintf.h>
+#include <stdarg.h>
+#endif
+
+static enum NTDB_ERROR update_rec_hdr(struct ntdb_context *ntdb,
+				     ntdb_off_t off,
+				     ntdb_len_t keylen,
+				     ntdb_len_t datalen,
+				     struct ntdb_used_record *rec,
+				     uint64_t h)
+{
+	/* Room available for data: current data length plus padding. */
+	uint64_t room = rec_data_length(rec) + rec_extra_padding(rec);
+	enum NTDB_ERROR err;
+	err = set_header(ntdb, rec, NTDB_USED_MAGIC, keylen, datalen,
+			 keylen + room, h);
+	if (err != NTDB_SUCCESS)
+		return err;
+	/* Header filled in through rec: now write it out at off. */
+	return ntdb_write_convert(ntdb, off, rec, sizeof(*rec));
+}
+
+static enum NTDB_ERROR replace_data(struct ntdb_context *ntdb,
+				   struct hash_info *h,
+				   NTDB_DATA key, NTDB_DATA dbuf,
+				   ntdb_off_t old_off, ntdb_len_t old_room,
+				   bool growing)
+{
+	ntdb_off_t new_off;
+	enum NTDB_ERROR ecode;
+
+	/* Allocate a new record for this key and data. */
+	new_off = alloc(ntdb, key.dsize, dbuf.dsize, h->h, NTDB_USED_MAGIC,
+			growing);
+	if (NTDB_OFF_IS_ERR(new_off)) {
+		return NTDB_OFF_TO_ERR(new_off);
+	}
+
+	/* We didn't like the existing one: free it and repoint the hash. */
+	if (old_off) {
+		ntdb->stats.frees++;
+		ecode = add_free_record(ntdb, old_off,
+					sizeof(struct ntdb_used_record)
+					+ key.dsize + old_room,
+					NTDB_LOCK_WAIT, true);
+		if (ecode == NTDB_SUCCESS)
+			ecode = replace_in_hash(ntdb, h, new_off);
+	} else {	/* no previous record: this is a fresh hash entry */
+		ecode = add_to_hash(ntdb, h, new_off);
+	}
+	if (ecode != NTDB_SUCCESS) {
+		return ecode;
+	}
+
+	new_off += sizeof(struct ntdb_used_record);	/* key follows header */
+	ecode = ntdb->io->twrite(ntdb, new_off, key.dptr, key.dsize);
+	if (ecode != NTDB_SUCCESS) {
+		return ecode;
+	}
+
+	new_off += key.dsize;	/* data follows the key */
+	ecode = ntdb->io->twrite(ntdb, new_off, dbuf.dptr, dbuf.dsize);
+	if (ecode != NTDB_SUCCESS) {
+		return ecode;
+	}
+
+	if (ntdb->flags & NTDB_SEQNUM)
+		ntdb_inc_seqnum(ntdb);
+
+	return NTDB_SUCCESS;
+}
+
+static enum NTDB_ERROR update_data(struct ntdb_context *ntdb,
+				  ntdb_off_t off,
+				  NTDB_DATA dbuf,
+				  ntdb_len_t extra)
+{
+	enum NTDB_ERROR ecode;
+
+	ecode = ntdb->io->twrite(ntdb, off, dbuf.dptr, dbuf.dsize);
+	if (ecode == NTDB_SUCCESS && extra) {
+		/* Put a zero in; future versions may append other data. */
+		ecode = ntdb->io->twrite(ntdb, off + dbuf.dsize, "", 1);
+	}
+	if (ntdb->flags & NTDB_SEQNUM)	/* bumped even if a write failed */
+		ntdb_inc_seqnum(ntdb);
+
+	return ecode;
+}
+
+_PUBLIC_ enum NTDB_ERROR ntdb_store(struct ntdb_context *ntdb,
+			 NTDB_DATA key, NTDB_DATA dbuf, int flag)
+{
+	struct hash_info h;
+	ntdb_off_t off;
+	ntdb_len_t old_room = 0;
+	struct ntdb_used_record rec;
+	enum NTDB_ERROR ecode;
+
+	off = find_and_lock(ntdb, key, F_WRLCK, &h, &rec, NULL);
+	if (NTDB_OFF_IS_ERR(off)) {
+		return ntdb->last_error = NTDB_OFF_TO_ERR(off);
+	}
+
+	/* Now we hold the write lock on this hash bucket. */
+	if (flag == NTDB_INSERT) {
+		if (off) {
+			ecode = NTDB_ERR_EXISTS;
+			goto out;
+		}
+	} else {
+		if (off) {
+			old_room = rec_data_length(&rec)
+				+ rec_extra_padding(&rec);
+			if (old_room >= dbuf.dsize) {
+				/* Old record has room: modify in-place.  Easy! */
+				ecode = update_rec_hdr(ntdb, off,
+						       key.dsize, dbuf.dsize,
+						       &rec, h.h);
+				if (ecode != NTDB_SUCCESS) {
+					goto out;
+				}
+				ecode = update_data(ntdb,
+						    off + sizeof(rec)
+						    + key.dsize, dbuf,
+						    old_room - dbuf.dsize);
+				if (ecode != NTDB_SUCCESS) {
+					goto out;
+				}
+				ntdb_unlock_hashes(ntdb, h.hlock_start,
+						  h.hlock_range, F_WRLCK);
+				return ntdb->last_error = NTDB_SUCCESS;
+			}
+		} else {
+			if (flag == NTDB_MODIFY) {
+				/* if the record doesn't exist and we
+				   are in NTDB_MODIFY mode then we should fail
+				   the store */
+				ecode = NTDB_ERR_NOEXIST;
+				goto out;
+			}
+		}
+	}
+
+	/* An existing record (off != 0) that was too small means growing. */
+	ecode = replace_data(ntdb, &h, key, dbuf, off, old_room, off);
+out:
+	ntdb_unlock_hashes(ntdb, h.hlock_start, h.hlock_range, F_WRLCK);
+	return ntdb->last_error = ecode;
+}
+
+_PUBLIC_ enum NTDB_ERROR ntdb_append(struct ntdb_context *ntdb,
+			  NTDB_DATA key, NTDB_DATA dbuf)
+{
+	struct hash_info h;
+	ntdb_off_t off;
+	struct ntdb_used_record rec;
+	ntdb_len_t old_room = 0, old_dlen;
+	unsigned char *newdata;
+	NTDB_DATA new_dbuf;
+	enum NTDB_ERROR ecode;
+
+	off = find_and_lock(ntdb, key, F_WRLCK, &h, &rec, NULL);
+	if (NTDB_OFF_IS_ERR(off)) {
+		return ntdb->last_error = NTDB_OFF_TO_ERR(off);
+	}
+
+	if (off) {
+		old_dlen = rec_data_length(&rec);
+		old_room = old_dlen + rec_extra_padding(&rec);
+
+		/* Fast path: the padding has room, so append in place. */
+		if (rec_extra_padding(&rec) >= dbuf.dsize) {
+			ecode = update_rec_hdr(ntdb, off, key.dsize,
+					       old_dlen + dbuf.dsize, &rec,
+					       h.h);
+			if (ecode != NTDB_SUCCESS) {
+				goto out;
+			}
+
+			off += sizeof(rec) + key.dsize + old_dlen;
+			ecode = update_data(ntdb, off, dbuf,
+					    rec_extra_padding(&rec));
+			goto out;	/* newdata was never allocated here */
+		}
+
+		/* Slow path: old data + dbuf (NOTE: malloc is key.dsize too big). */
+		newdata = malloc(key.dsize + old_dlen + dbuf.dsize);
+		if (!newdata) {
+			ecode = ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
+					   "ntdb_append:"
+					   " failed to allocate %zu bytes",
+					   (size_t)(key.dsize + old_dlen
+						    + dbuf.dsize));
+			goto out;
+		}
+		ecode = ntdb->io->tread(ntdb, off + sizeof(rec) + key.dsize,
+				       newdata, old_dlen);
+		if (ecode != NTDB_SUCCESS) {
+			goto out_free_newdata;
+		}
+		memcpy(newdata + old_dlen, dbuf.dptr, dbuf.dsize);
+		new_dbuf.dptr = newdata;
+		new_dbuf.dsize = old_dlen + dbuf.dsize;
+	} else {
+		newdata = NULL;	/* keeps the free() below harmless */
+		new_dbuf = dbuf;
+	}
+
+	/* If they're using ntdb_append(), it implies they're growing record. */
+	ecode = replace_data(ntdb, &h, key, new_dbuf, off, old_room, true);
+
+out_free_newdata:
+	free(newdata);
+out:
+	ntdb_unlock_hashes(ntdb, h.hlock_start, h.hlock_range, F_WRLCK);
+	return ntdb->last_error = ecode;
+}
+
+_PUBLIC_ enum NTDB_ERROR ntdb_fetch(struct ntdb_context *ntdb, NTDB_DATA key,
+			 NTDB_DATA *data)
+{
+	struct hash_info hinfo;
+	struct ntdb_used_record hdr;
+	enum NTDB_ERROR result = NTDB_ERR_NOEXIST;
+	ntdb_off_t rec_off;
+
+	/* Look the key up, asking for a read lock (F_RDLCK). */
+	rec_off = find_and_lock(ntdb, key, F_RDLCK, &hinfo, &hdr, NULL);
+	if (NTDB_OFF_IS_ERR(rec_off))
+		return ntdb->last_error = NTDB_OFF_TO_ERR(rec_off);
+
+	if (rec_off != 0) {
+		/* The data is read from past the record header and the key. */
+		data->dsize = rec_data_length(&hdr);
+		data->dptr = ntdb_alloc_read(ntdb,
+					    rec_off + sizeof(hdr) + key.dsize,
+					    data->dsize);
+		result = NTDB_PTR_IS_ERR(data->dptr)
+			? NTDB_PTR_ERR(data->dptr) : NTDB_SUCCESS;
+	}
+
+	/* *data is left untouched when the key does not exist. */
+	ntdb_unlock_hashes(ntdb, hinfo.hlock_start, hinfo.hlock_range,
+			  F_RDLCK);
+	return ntdb->last_error = result;
+}
+
+_PUBLIC_ bool ntdb_exists(struct ntdb_context *ntdb, NTDB_DATA key)
+{
+	struct hash_info hinfo;
+	struct ntdb_used_record hdr;
+	ntdb_off_t rec_off;
+
+	rec_off = find_and_lock(ntdb, key, F_RDLCK, &hinfo, &hdr, NULL);
+	if (!NTDB_OFF_IS_ERR(rec_off)) {
+		ntdb_unlock_hashes(ntdb, hinfo.hlock_start, hinfo.hlock_range,
+				  F_RDLCK);
+		ntdb->last_error = NTDB_SUCCESS;
+		return rec_off != 0;	/* non-zero offset: key is present */
+	}
+	ntdb->last_error = NTDB_OFF_TO_ERR(rec_off);
+	return false;	/* the lookup itself failed */
+}
+
+_PUBLIC_ enum NTDB_ERROR ntdb_delete(struct ntdb_context *ntdb, NTDB_DATA key)
+{
+	ntdb_off_t off;
+	struct ntdb_used_record rec;
+	struct hash_info h;
+	enum NTDB_ERROR ecode;
+
+	off = find_and_lock(ntdb, key, F_WRLCK, &h, &rec, NULL);
+	if (NTDB_OFF_IS_ERR(off)) {
+		return ntdb->last_error = NTDB_OFF_TO_ERR(off);
+	}
+
+	if (!off) {	/* key not found */
+		ecode = NTDB_ERR_NOEXIST;
+		goto unlock;
+	}
+
+	ecode = delete_from_hash(ntdb, &h);
+	if (ecode != NTDB_SUCCESS) {
+		goto unlock;
+	}
+
+	/* Free the deleted entry: header + key + data + padding. */
+	ntdb->stats.frees++;
+	ecode = add_free_record(ntdb, off,
+				sizeof(struct ntdb_used_record)
+				+ rec_key_length(&rec)
+				+ rec_data_length(&rec)
+				+ rec_extra_padding(&rec),
+				NTDB_LOCK_WAIT, true);
+
+	if (ntdb->flags & NTDB_SEQNUM)	/* bumped whatever add_free_record said */
+		ntdb_inc_seqnum(ntdb);
+
+unlock:
+	ntdb_unlock_hashes(ntdb, h.hlock_start, h.hlock_range, F_WRLCK);
+	return ntdb->last_error = ecode;
+}
+
+_PUBLIC_ unsigned int ntdb_get_flags(struct ntdb_context *ntdb)
+{	/* Return the context's current flag word, unmodified. */
+	return ntdb->flags;
+}
+
+static bool inside_transaction(const struct ntdb_context *ntdb)
+{	/* True while ntdb->transaction is non-NULL. */
+	return ntdb->transaction != NULL;
+}
+
+static bool readonly_changable(struct ntdb_context *ntdb, const char *caller)
+{
+	/* NTDB_RDONLY may only be changed outside a transaction. */
+	if (!inside_transaction(ntdb))
+		return true;
+	ntdb->last_error = ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
+				     NTDB_LOG_USE_ERROR,
+				     "%s: can't change"
+				     " NTDB_RDONLY inside transaction",
+				     caller);
+	return false;
+}
+
+_PUBLIC_ void ntdb_add_flag(struct ntdb_context *ntdb, unsigned flag)
+{
+	if (ntdb->flags & NTDB_INTERNAL) {	/* refused for internal dbs */
+		ntdb->last_error = ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
+					     NTDB_LOG_USE_ERROR,
+					     "ntdb_add_flag: internal db");
+		return;
+	}
+	switch (flag) {	/* exact match: OR-ed combinations hit default */
+	case NTDB_NOLOCK:
+		ntdb->flags |= NTDB_NOLOCK;
+		break;
+	case NTDB_NOMMAP:
+		ntdb->flags |= NTDB_NOMMAP;
+#ifndef HAVE_INCOHERENT_MMAP
+		ntdb_munmap(ntdb->file);
+#endif
+		break;
+	case NTDB_NOSYNC:
+		ntdb->flags |= NTDB_NOSYNC;
+		break;
+	case NTDB_SEQNUM:
+		ntdb->flags |= NTDB_SEQNUM;
+		break;
+	case NTDB_ALLOW_NESTING:
+		ntdb->flags |= NTDB_ALLOW_NESTING;
+		break;
+	case NTDB_RDONLY:
+		if (readonly_changable(ntdb, "ntdb_add_flag"))
+			ntdb->flags |= NTDB_RDONLY;
+		break;
+	default:
+		ntdb->last_error = ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
+					     NTDB_LOG_USE_ERROR,
+					     "ntdb_add_flag: Unknown flag %u",
+					     flag);
+	}
+}
+
+_PUBLIC_ void ntdb_remove_flag(struct ntdb_context *ntdb, unsigned flag)
+{
+	if (ntdb->flags & NTDB_INTERNAL) {	/* refused for internal dbs */
+		ntdb->last_error = ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
+					     NTDB_LOG_USE_ERROR,
+					     "ntdb_remove_flag: internal db");
+		return;
+	}
+	switch (flag) {	/* exact match: OR-ed combinations hit default */
+	case NTDB_NOLOCK:
+		ntdb->flags &= ~NTDB_NOLOCK;
+		break;
+	case NTDB_NOMMAP:
+		ntdb->flags &= ~NTDB_NOMMAP;
+#ifndef HAVE_INCOHERENT_MMAP
+		/* If mmap is incoherent, we were mmaping anyway. */
+		ntdb_mmap(ntdb);
+#endif
+		break;
+	case NTDB_NOSYNC:
+		ntdb->flags &= ~NTDB_NOSYNC;
+		break;
+	case NTDB_SEQNUM:
+		ntdb->flags &= ~NTDB_SEQNUM;
+		break;
+	case NTDB_ALLOW_NESTING:
+		ntdb->flags &= ~NTDB_ALLOW_NESTING;
+		break;
+	case NTDB_RDONLY:
+		if ((ntdb->open_flags & O_ACCMODE) == O_RDONLY) {
+			ntdb->last_error = ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
+						     NTDB_LOG_USE_ERROR,
+						     "ntdb_remove_flag: can't"
+						     " remove NTDB_RDONLY on ntdb"
+						     " opened with O_RDONLY");
+			break;
+		}
+		if (readonly_changable(ntdb, "ntdb_remove_flag"))
+			ntdb->flags &= ~NTDB_RDONLY;
+		break;
+	default:
+		ntdb->last_error = ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
+					     NTDB_LOG_USE_ERROR,
+					     "ntdb_remove_flag: Unknown flag %u",
+					     flag);
+	}
+}
+
+_PUBLIC_ const char *ntdb_errorstr(enum NTDB_ERROR ecode)
+{
+	/* Gcc warns if you miss a case in the switch, so there is no default. */
+	switch (NTDB_ERR_TO_OFF(ecode)) {
+	case NTDB_ERR_TO_OFF(NTDB_SUCCESS): return "Success";
+	case NTDB_ERR_TO_OFF(NTDB_ERR_CORRUPT): return "Corrupt database";
+	case NTDB_ERR_TO_OFF(NTDB_ERR_IO): return "IO Error";
+	case NTDB_ERR_TO_OFF(NTDB_ERR_LOCK): return "Locking error";
+	case NTDB_ERR_TO_OFF(NTDB_ERR_OOM): return "Out of memory";
+	case NTDB_ERR_TO_OFF(NTDB_ERR_EXISTS): return "Record exists";
+	case NTDB_ERR_TO_OFF(NTDB_ERR_EINVAL): return "Invalid parameter";
+	case NTDB_ERR_TO_OFF(NTDB_ERR_NOEXIST): return "Record does not exist";
+	case NTDB_ERR_TO_OFF(NTDB_ERR_RDONLY): return "write not permitted";
+	}
+	return "Invalid error code";	/* none of the cases above matched */
+}
+
+_PUBLIC_ enum NTDB_ERROR ntdb_error(struct ntdb_context *ntdb)
+{	/* Return the code most recently stored in ntdb->last_error. */
+	return ntdb->last_error;
+}
+
+enum NTDB_ERROR COLD ntdb_logerr(struct ntdb_context *ntdb,
+			       enum NTDB_ERROR ecode,
+			       enum ntdb_log_level level,
+			       const char *fmt, ...)
+{	/* Format a message, pass it to ntdb->log_fn, and return ecode. */
+	char *message;
+	va_list ap;
+	int len;	/* must be signed: vasprintf() returns -1 on failure */
+	/* ntdb_open paths care about errno, so save it. */
+	int saved_errno = errno;
+
+	if (!ntdb->log_fn)	/* no logger installed: nothing to format */
+		return ecode;
+
+	va_start(ap, fmt);
+	len = vasprintf(&message, fmt, ap);
+	va_end(ap);
+
+	if (len < 0) {	/* message is unusable: log the raw format instead */
+		ntdb->log_fn(ntdb, NTDB_LOG_ERROR, NTDB_ERR_OOM,
+			    "out of memory formatting message:", ntdb->log_data);
+		ntdb->log_fn(ntdb, level, ecode, fmt, ntdb->log_data);
+	} else {
+		ntdb->log_fn(ntdb, level, ecode, message, ntdb->log_data);
+		free(message);
+	}
+	errno = saved_errno;
+	return ecode;
+}
+
+_PUBLIC_ enum NTDB_ERROR ntdb_parse_record_(struct ntdb_context *ntdb,
+				 NTDB_DATA key,
+				 enum NTDB_ERROR (*parse)(NTDB_DATA k,
+							 NTDB_DATA d,
+							 void *data),
+				 void *data)
+{
+	ntdb_off_t off;
+	struct ntdb_used_record rec;
+	struct hash_info h;
+	enum NTDB_ERROR ecode;
+
+	off = find_and_lock(ntdb, key, F_RDLCK, &h, &rec, NULL);
+	if (NTDB_OFF_IS_ERR(off)) {
+		return ntdb->last_error = NTDB_OFF_TO_ERR(off);
+	}
+
+	if (!off) {	/* key not found: parse is never called */
+		ecode = NTDB_ERR_NOEXIST;
+	} else {
+		const void *dptr;
+		dptr = ntdb_access_read(ntdb, off + sizeof(rec) + key.dsize,
+				       rec_data_length(&rec), false);
+		if (NTDB_PTR_IS_ERR(dptr)) {
+			ecode = NTDB_PTR_ERR(dptr);
+		} else {
+			NTDB_DATA d = ntdb_mkdata(dptr, rec_data_length(&rec));
+
+			ecode = parse(key, d, data);	/* its result is ours */
+			ntdb_access_release(ntdb, dptr);
+		}
+	}
+
+	ntdb_unlock_hashes(ntdb, h.hlock_start, h.hlock_range, F_RDLCK);
+	return ntdb->last_error = ecode;
+}
+
+_PUBLIC_ const char *ntdb_name(const struct ntdb_context *ntdb)
+{	/* Return the stored name pointer; no copy is made. */
+	return ntdb->name;
+}
+
+_PUBLIC_ int64_t ntdb_get_seqnum(struct ntdb_context *ntdb)
+{
+	/* Read the seqnum field of the header; errors come back in-band. */
+	ntdb_off_t seq = ntdb_read_off(ntdb,
+				     offsetof(struct ntdb_header, seqnum));
+
+	/* Mirror the outcome into last_error before handing the value on. */
+	ntdb->last_error = NTDB_OFF_IS_ERR(seq)
+		? NTDB_OFF_TO_ERR(seq) : NTDB_SUCCESS;
+	return seq;
+}
+
+
+_PUBLIC_ int ntdb_fd(const struct ntdb_context *ntdb)
+{	/* Return the fd stored in ntdb->file. */
+	return ntdb->file->fd;
+}
+
+struct traverse_state {
+	enum NTDB_ERROR error;		/* result of the last ntdb_store */
+	struct ntdb_context *dest_db;	/* database records are stored into */
+};
+
+/*
+  Traverse callback for ntdb_repack: insert each record into state->dest_db.
+ */
+static int repack_traverse(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA data,
+			   struct traverse_state *state)
+{
+	enum NTDB_ERROR err;
+
+	err = ntdb_store(state->dest_db, key, data, NTDB_INSERT);
+	state->error = err;	/* kept for ntdb_repack to return */
+	return err == NTDB_SUCCESS ? 0 : -1;
+}
+
+_PUBLIC_ enum NTDB_ERROR ntdb_repack(struct ntdb_context *ntdb)
+{	/* Rewrite ntdb via an internal scratch db, inside one transaction. */
+	struct ntdb_context *tmp_db;
+	struct traverse_state state;
+	int64_t ret;
+
+	state.error = ntdb_transaction_start(ntdb);
+	if (state.error != NTDB_SUCCESS)
+		return state.error;
+
+	tmp_db = ntdb_open("tmpdb", NTDB_INTERNAL, O_RDWR|O_CREAT, 0, NULL);
+	if (tmp_db == NULL) {
+		state.error = ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
+					 __location__
+					 " Failed to create tmp_db");
+		ntdb_transaction_cancel(ntdb);
+		return ntdb->last_error = state.error;
+	}
+
+	/* Check state.error too: repack_traverse reports failed stores there. */
+	state.dest_db = tmp_db;
+	ret = ntdb_traverse(ntdb, repack_traverse, &state);
+	if (ret < 0 || state.error != NTDB_SUCCESS)
+		goto fail;
+
+	state.error = ntdb_wipe_all(ntdb);
+	if (state.error != NTDB_SUCCESS)
+		goto fail;
+
+	state.dest_db = ntdb;
+	ret = ntdb_traverse(tmp_db, repack_traverse, &state);
+	if (ret < 0 || state.error != NTDB_SUCCESS)
+		goto fail;
+	ntdb_close(tmp_db);
+	return ntdb_transaction_commit(ntdb);
+fail:
+	if (state.error == NTDB_SUCCESS)	/* the traversal itself failed */
+		state.error = NTDB_OFF_TO_ERR(ret);
+	ntdb_transaction_cancel(ntdb);
+	ntdb_close(tmp_db);
+	return state.error;
+}
diff --git a/lib/ntdb/ntdb.h b/lib/ntdb/ntdb.h
new file mode 100644
index 0000000000..f0833b7261
--- /dev/null
+++ b/lib/ntdb/ntdb.h
@@ -0,0 +1,901 @@
+#ifndef CCAN_NTDB_H
+#define CCAN_NTDB_H
+
+/*
+   NTDB: trivial database library version 2
+
+   Copyright (C) Andrew Tridgell 1999-2004
+   Copyright (C) Rusty Russell 2010-2012
+
+     ** NOTE! The following LGPL license applies to the ntdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#ifdef HAVE_LIBREPLACE
+#include <replace.h>
+#else
+#if HAVE_FILE_OFFSET_BITS
+#define _FILE_OFFSET_BITS 64
+#endif
+/* For mode_t */
+#include <sys/types.h>
+/* For O_* flags. */
+#include <sys/stat.h>
+/* For sig_atomic_t. */
+#include <signal.h>
+/* For uint64_t */
+#include <stdint.h>
+/* For bool */
+#include <stdbool.h>
+/* For memcmp */
+#include <string.h>
+#endif
+
+#if HAVE_CCAN
+#include <ccan/compiler/compiler.h>
+#include <ccan/typesafe_cb/typesafe_cb.h>
+#include <ccan/cast/cast.h>
+#else
+#ifndef typesafe_cb_preargs
+/* Failing to have CCAN just means less typesafe protection, etc. */
+#define typesafe_cb_preargs(rtype, atype, fn, arg, ...)	\
+	((rtype (*)(__VA_ARGS__, atype))(fn))
+#endif
+#ifndef cast_const /* cast_const(type, expr): cast away const; type is a pointer type */
+#if defined(__intptr_t_defined) || defined(HAVE_INTPTR_T)
+#define cast_const(type, expr) ((type)((intptr_t)(expr)))
+#else /* no intptr_t: plain cast to the same type the branch above yields */
+#define cast_const(type, expr) ((type)(expr))
+#endif
+#endif
+#endif /* !HAVE_CCAN */
+
+union ntdb_attribute;
+struct ntdb_context;
+
+/**
+ * struct TDB_DATA - (n)tdb data blob
+ *
+ * To ease compatibility, we use 'struct TDB_DATA' from tdb.h, so if
+ * you want to include both tdb.h and ntdb.h, you need to #include
+ * tdb.h first.
+ */
+#ifndef __TDB_H__
+struct TDB_DATA {
+	unsigned char *dptr;
+	size_t dsize;
+};
+#endif
+
+typedef struct TDB_DATA NTDB_DATA;
+
+/**
+ * ntdb_open - open a database file
+ * @name: the file name (can be NULL if flags contains NTDB_INTERNAL)
+ * @ntdb_flags: options for this database
+ * @open_flags: flags argument for ntdb's open() call.
+ * @mode: mode argument for ntdb's open() call.
+ * @attributes: linked list of extra attributes for this ntdb.
+ *
+ * This call opens (and potentially creates) a database file.
+ * Multiple processes can have the NTDB file open at once.
+ *
+ * On failure it will return NULL, and set errno: it may also call
+ * any log attribute found in @attributes.
+ *
+ * See also:
+ *	union ntdb_attribute
+ */
+struct ntdb_context *ntdb_open(const char *name, int ntdb_flags,
+			       int open_flags, mode_t mode,
+			       union ntdb_attribute *attributes);
+
+
+/* flags for ntdb_open() */
+#define NTDB_DEFAULT 0 /* just a readability place holder */
+#define NTDB_INTERNAL 2 /* don't store on disk */
+#define NTDB_NOLOCK   4 /* don't do any locking */
+#define NTDB_NOMMAP   8 /* don't use mmap */
+#define NTDB_CONVERT 16 /* convert endian */
+#define NTDB_NOSYNC   64 /* don't use synchronous transactions */
+#define NTDB_SEQNUM   128 /* maintain a sequence number */
+#define NTDB_ALLOW_NESTING   256 /* fake nested transactions */
+#define NTDB_RDONLY   512 /* implied by O_RDONLY */
+#define NTDB_CANT_CHECK  2048 /* has a feature which we don't understand */
+
+/**
+ * ntdb_close - close and free a ntdb.
+ * @ntdb: the ntdb context returned from ntdb_open()
+ *
+ * This always succeeds, in that @ntdb is unusable after this call.  But if
+ * some unexpected error occurred while closing, it will return non-zero
+ * (the only clue as to cause will be via the log attribute).
+ */
+int ntdb_close(struct ntdb_context *ntdb);
+
+/**
+ * enum NTDB_ERROR - error returns for NTDB
+ *
+ * See Also:
+ *	ntdb_errorstr()
+ */
+enum NTDB_ERROR {
+	NTDB_SUCCESS	= 0,	/* No error. */
+	NTDB_ERR_CORRUPT = -1,	/* We read the db, and it was bogus. */
+	NTDB_ERR_IO	= -2,	/* We couldn't read/write the db. */
+	NTDB_ERR_LOCK	= -3,	/* Locking failed. */
+	NTDB_ERR_OOM	= -4,	/* Out of Memory. */
+	NTDB_ERR_EXISTS	= -5,	/* The key already exists. */
+	NTDB_ERR_NOEXIST	= -6,	/* The key does not exist. */
+	NTDB_ERR_EINVAL	= -7,	/* You're using it wrong. */
+	NTDB_ERR_RDONLY	= -8,	/* The database is read-only. */
+	NTDB_ERR_LAST = NTDB_ERR_RDONLY
+};
+
+/**
+ * ntdb_store - store a key/value pair in a ntdb.
+ * @ntdb: the ntdb context returned from ntdb_open()
+ * @key: the key
+ * @dbuf: the data to associate with the key.
+ * @flag: NTDB_REPLACE, NTDB_INSERT or NTDB_MODIFY.
+ *
+ * This inserts (or overwrites) a key/value pair in the NTDB.  If flag
+ * is NTDB_REPLACE, it doesn't matter whether the key exists or not;
+ * NTDB_INSERT means it must not exist (returns NTDB_ERR_EXISTS otherwise),
+ * and NTDB_MODIFY means it must exist (returns NTDB_ERR_NOEXIST otherwise).
+ *
+ * On success, this returns NTDB_SUCCESS.
+ *
+ * See also:
+ *	ntdb_fetch, ntdb_transaction_start, ntdb_append, ntdb_delete.
+ */
+enum NTDB_ERROR ntdb_store(struct ntdb_context *ntdb,
+			   NTDB_DATA key,
+			   NTDB_DATA dbuf,
+			   int flag);
+
+/* flags to ntdb_store() */
+#define NTDB_REPLACE 1		/* Store whether or not the key exists */
+#define NTDB_INSERT 2 		/* Don't overwrite an existing entry */
+#define NTDB_MODIFY 3		/* Don't create a new entry */
+
+/**
+ * ntdb_fetch - fetch a value from a ntdb.
+ * @ntdb: the ntdb context returned from ntdb_open()
+ * @key: the key
+ * @data: pointer to data.
+ *
+ * This looks up a key in the database and sets it in @data.
+ *
+ * If it returns NTDB_SUCCESS, the key was found: it is your
+ * responsibility to call free() on @data->dptr.
+ *
+ * Otherwise, it returns an error (usually, NTDB_ERR_NOEXIST) and @data is
+ * undefined.
+ */
+enum NTDB_ERROR ntdb_fetch(struct ntdb_context *ntdb, NTDB_DATA key,
+			   NTDB_DATA *data);
+
+/**
+ * ntdb_errorstr - map the ntdb error onto a constant readable string
+ * @ecode: the enum NTDB_ERROR to map.
+ *
+ * This is useful for displaying errors to users.
+ */
+const char *ntdb_errorstr(enum NTDB_ERROR ecode);
+
+/**
+ * ntdb_append - append a value to a key/value pair in a ntdb.
+ * @ntdb: the ntdb context returned from ntdb_open()
+ * @key: the key
+ * @dbuf: the data to append.
+ *
+ * This is equivalent to fetching a record, reallocating .dptr to add the
+ * data, and writing it back, only it's much more efficient.  If the key
+ * doesn't exist, it's equivalent to ntdb_store (with an additional hint that
+ * you expect to expand the record in future).
+ *
+ * See Also:
+ *	ntdb_fetch(), ntdb_store()
+ */
+enum NTDB_ERROR ntdb_append(struct ntdb_context *ntdb,
+			    NTDB_DATA key, NTDB_DATA dbuf);
+
+/**
+ * ntdb_delete - delete a key from a ntdb.
+ * @ntdb: the ntdb context returned from ntdb_open()
+ * @key: the key to delete.
+ *
+ * Returns NTDB_SUCCESS on success, or an error (usually NTDB_ERR_NOEXIST).
+ *
+ * See Also:
+ *	ntdb_fetch(), ntdb_store()
+ */
+enum NTDB_ERROR ntdb_delete(struct ntdb_context *ntdb, NTDB_DATA key);
+
+/**
+ * ntdb_exists - does a key exist in the database?
+ * @ntdb: the ntdb context returned from ntdb_open()
+ * @key: the key to search for.
+ *
+ * Returns true if it exists, or false if it doesn't or any other error.
+ */
+bool ntdb_exists(struct ntdb_context *ntdb, NTDB_DATA key);
+
+/**
+ * ntdb_deq - are NTDB_DATA equal (same dsize and same bytes)?
+ * @a: one NTDB_DATA
+ * @b: another NTDB_DATA (dptr may be NULL on both sides when dsize is 0)
+ */
+static inline bool ntdb_deq(NTDB_DATA a, NTDB_DATA b)
+{
+	return a.dsize == b.dsize && (!a.dsize || !memcmp(a.dptr, b.dptr, a.dsize));
+}
+
+/**
+ * ntdb_mkdata - make a NTDB_DATA from const data
+ * @p: the constant pointer
+ * @len: the length
+ *
+ * As the dptr member of NTDB_DATA is not constant, you need to
+ * cast it.  This function keeps those casts in one place, as well as
+ * suppressing the warning some compilers give when casting away a
+ * qualifier (eg. gcc with -Wcast-qual)
+ */
+static inline NTDB_DATA ntdb_mkdata(const void *p, size_t len)
+{
+	NTDB_DATA blob;		/* points at @p itself; nothing is copied */
+	blob.dsize = len;
+	blob.dptr = cast_const(void *, p);	/* drops const, no warning */
+	return blob;
+}
+
+/**
+ * ntdb_transaction_start - start a transaction
+ * @ntdb: the ntdb context returned from ntdb_open()
+ *
+ * This begins a series of atomic operations.  Other processes will be able
+ * to read the ntdb, but not alter it (they will block), nor will they see
+ * any changes until ntdb_transaction_commit() is called.
+ *
+ * Note that if the NTDB_ALLOW_NESTING flag is set, a ntdb_transaction_start()
+ * within a transaction will succeed, but it's not a real transaction:
+ * (1) An inner transaction which is committed is not actually committed until
+ *     the outer transaction is; if the outer transaction is cancelled, the
+ *     inner ones are discarded.
+ * (2) ntdb_transaction_cancel() marks the outer transaction as having an error,
+ *     so the final ntdb_transaction_commit() will fail.
+ * (3) the outer transaction will see the results of the inner transaction.
+ *
+ * See Also:
+ *	ntdb_transaction_cancel, ntdb_transaction_commit.
+ */
+enum NTDB_ERROR ntdb_transaction_start(struct ntdb_context *ntdb);
+
+/**
+ * ntdb_transaction_cancel - abandon a transaction
+ * @ntdb: the ntdb context returned from ntdb_open()
+ *
+ * This aborts a transaction, discarding any changes which were made.
+ * ntdb_close() does this implicitly.
+ */
+void ntdb_transaction_cancel(struct ntdb_context *ntdb);
+
+/**
+ * ntdb_transaction_commit - commit a transaction
+ * @ntdb: the ntdb context returned from ntdb_open()
+ *
+ * This completes a transaction, writing any changes which were made.
+ *
+ * fsync() is used to commit the transaction (unless NTDB_NOSYNC is set),
+ * making it robust against machine crashes, but very slow compared to
+ * other NTDB operations.
+ *
+ * A failure can only be caused by unexpected errors (eg. I/O or
+ * memory); there is no point looping on transaction failure.
+ *
+ * See Also:
+ *	ntdb_transaction_prepare_commit()
+ */
+enum NTDB_ERROR ntdb_transaction_commit(struct ntdb_context *ntdb);
+
+/**
+ * ntdb_transaction_prepare_commit - prepare to commit a transaction
+ * @ntdb: the ntdb context returned from ntdb_open()
+ *
+ * This ensures we have the resources to commit a transaction (using
+ * ntdb_transaction_commit): if this succeeds then a transaction will only
+ * fail if the write() or fsync() calls fail.
+ *
+ * If this fails you must still call ntdb_transaction_cancel() to cancel
+ * the transaction.
+ *
+ * See Also:
+ *	ntdb_transaction_commit()
+ */
+enum NTDB_ERROR ntdb_transaction_prepare_commit(struct ntdb_context *ntdb);
+
+/**
+ * ntdb_traverse - traverse a NTDB
+ * @ntdb: the ntdb context returned from ntdb_open()
+ * @fn: the function to call for every key/value pair (or NULL)
+ * @p: the pointer to hand to @fn
+ *
+ * This walks the NTDB until all the keys have been traversed, or @fn
+ * returns non-zero.  If the traverse function or other processes are
+ * changing data or adding or deleting keys, the traverse may be
+ * unreliable: keys may be skipped or (rarely) visited twice.
+ *
+ * There is one specific exception: the special case of deleting the
+ * current key does not undermine the reliability of the traversal.
+ *
+ * On success, returns the number of keys iterated.  On error returns
+ * a negative enum NTDB_ERROR value.
+ */
+#define ntdb_traverse(ntdb, fn, p)					\
+	ntdb_traverse_(ntdb, typesafe_cb_preargs(int, void *, (fn), (p), \
+						 struct ntdb_context *,	\
+						 NTDB_DATA, NTDB_DATA), (p))
+
+int64_t ntdb_traverse_(struct ntdb_context *ntdb,
+		       int (*fn)(struct ntdb_context *,
+				 NTDB_DATA, NTDB_DATA, void *), void *p);
+
+/**
+ * ntdb_parse_record - operate directly on data in the database.
+ * @ntdb: the ntdb context returned from ntdb_open()
+ * @key: the key whose record we should hand to @parse
+ * @parse: the function to call for the data
+ * @data: the private pointer to hand to @parse (types must match).
+ *
+ * This avoids a copy for many cases, by handing you a pointer into
+ * the memory-mapped database.  It also locks the record to prevent
+ * other accesses at the same time.
+ *
+ * Do not alter the data handed to parse()!
+ */
+#define ntdb_parse_record(ntdb, key, parse, data)			\
+	ntdb_parse_record_((ntdb), (key),				\
+			   typesafe_cb_preargs(enum NTDB_ERROR, void *,	\
+					       (parse), (data),		\
+					       NTDB_DATA, NTDB_DATA), (data))
+
+enum NTDB_ERROR ntdb_parse_record_(struct ntdb_context *ntdb,
+				   NTDB_DATA key,
+				   enum NTDB_ERROR (*parse)(NTDB_DATA k,
+							    NTDB_DATA d,
+							    void *data),
+				   void *data);
+
+/**
+ * ntdb_get_seqnum - get a database sequence number
+ * @ntdb: the ntdb context returned from ntdb_open()
+ *
+ * This returns a sequence number: any change to the database from a
+ * ntdb context opened with the NTDB_SEQNUM flag will cause that number
+ * to increment.  Note that the incrementing is unreliable (it is done
+ * without locking), so this is only useful as an optimization.
+ *
+ * For example, you may have a regular database backup routine which
+ * does not operate if the sequence number is unchanged.  In the
+ * unlikely event of a failed increment, it will be backed up next
+ * time any way.
+ *
+ * Returns an enum NTDB_ERROR (ie. negative) on error.
+ */
+int64_t ntdb_get_seqnum(struct ntdb_context *ntdb);
+
+/**
+ * ntdb_firstkey - get the "first" key in a NTDB
+ * @ntdb: the ntdb context returned from ntdb_open()
+ * @key: pointer to key.
+ *
+ * This returns an arbitrary key in the database; with ntdb_nextkey() it allows
+ * open-coded traversal of the database, though it is slightly less efficient
+ * than ntdb_traverse.
+ *
+ * It is your responsibility to free @key->dptr on success.
+ *
+ * Returns NTDB_ERR_NOEXIST if the database is empty.
+ */
+enum NTDB_ERROR ntdb_firstkey(struct ntdb_context *ntdb, NTDB_DATA *key);
+
+/**
+ * ntdb_nextkey - get the "next" key in a NTDB
+ * @ntdb: the ntdb context returned from ntdb_open()
+ * @key: a key returned by ntdb_firstkey() or ntdb_nextkey().
+ *
+ * This returns another key in the database; it will free @key.dptr for
+ * your convenience.
+ *
+ * Returns NTDB_ERR_NOEXIST if there are no more keys.
+ */
+enum NTDB_ERROR ntdb_nextkey(struct ntdb_context *ntdb, NTDB_DATA *key);
+
+/**
+ * ntdb_chainlock - lock a record in the NTDB
+ * @ntdb: the ntdb context returned from ntdb_open()
+ * @key: the key to lock.
+ *
+ * This prevents any access occurring to a group of keys including @key,
+ * even if @key does not exist.  This allows primitive atomic updates of
+ * records without using transactions.
+ *
+ * You cannot begin a transaction while holding a ntdb_chainlock(), nor can
+ * you do any operations on any other keys in the database.  This also means
+ * that you cannot hold more than one ntdb_chainlock() at a time.
+ *
+ * See Also:
+ *	ntdb_chainunlock()
+ */
+enum NTDB_ERROR ntdb_chainlock(struct ntdb_context *ntdb, NTDB_DATA key);
+
+/**
+ * ntdb_chainunlock - unlock a record in the NTDB
+ * @ntdb: the ntdb context returned from ntdb_open()
+ * @key: the key to unlock.
+ *
+ * The key must have previously been locked by ntdb_chainlock().
+ */
+void ntdb_chainunlock(struct ntdb_context *ntdb, NTDB_DATA key);
+
+/**
+ * ntdb_chainlock_read - lock a record in the NTDB, for reading
+ * @ntdb: the ntdb context returned from ntdb_open()
+ * @key: the key to lock.
+ *
+ * This prevents any changes from occurring to a group of keys including @key,
+ * even if @key does not exist.  This allows primitive atomic updates of
+ * records without using transactions.
+ *
+ * You cannot begin a transaction while holding a ntdb_chainlock_read(), nor can
+ * you do any operations on any other keys in the database.  This also means
+ * that you cannot hold more than one ntdb_chainlock()/read() at a time.
+ *
+ * See Also:
+ *	ntdb_chainlock()
+ */
+enum NTDB_ERROR ntdb_chainlock_read(struct ntdb_context *ntdb, NTDB_DATA key);
+
+/**
+ * ntdb_chainunlock_read - unlock a record in the NTDB for reading
+ * @ntdb: the ntdb context returned from ntdb_open()
+ * @key: the key to unlock.
+ *
+ * The key must have previously been locked by ntdb_chainlock_read().
+ */
+void ntdb_chainunlock_read(struct ntdb_context *ntdb, NTDB_DATA key);
+
+/**
+ * ntdb_lockall - lock the entire NTDB
+ * @ntdb: the ntdb context returned from ntdb_open()
+ *
+ * You cannot hold a ntdb_chainlock while calling this.  It nests, so you
+ * must call ntdb_unlockall as many times as you call ntdb_lockall.
+ */
+enum NTDB_ERROR ntdb_lockall(struct ntdb_context *ntdb);
+
+/**
+ * ntdb_unlockall - unlock the entire NTDB
+ * @ntdb: the ntdb context returned from ntdb_open()
+ */
+void ntdb_unlockall(struct ntdb_context *ntdb);
+
+/**
+ * ntdb_lockall_read - lock the entire NTDB for reading
+ * @ntdb: the ntdb context returned from ntdb_open()
+ *
+ * This prevents others writing to the database, eg. ntdb_delete, ntdb_store,
+ * ntdb_append, but not ntdb_fetch.
+ *
+ * You cannot hold a ntdb_chainlock while calling this.  It nests, so you
+ * must call ntdb_unlockall_read as many times as you call ntdb_lockall_read.
+ */
+enum NTDB_ERROR ntdb_lockall_read(struct ntdb_context *ntdb);
+
+/**
+ * ntdb_unlockall_read - unlock the entire NTDB for reading
+ * @ntdb: the ntdb context returned from ntdb_open()
+ */
+void ntdb_unlockall_read(struct ntdb_context *ntdb);
+
+/**
+ * ntdb_wipe_all - wipe the database clean
+ * @ntdb: the ntdb context returned from ntdb_open()
+ *
+ * Completely erase the database.  This is faster than iterating through
+ * each key and doing ntdb_delete.
+ */
+enum NTDB_ERROR ntdb_wipe_all(struct ntdb_context *ntdb);
+
+/**
+ * ntdb_repack - repack the database
+ * @ntdb: the ntdb context returned from ntdb_open()
+ *
+ * This repacks the database; if it is suffering from a great deal of
+ * fragmentation this might help.  However, it can take twice the
+ * memory of the existing NTDB.
+ */
+enum NTDB_ERROR ntdb_repack(struct ntdb_context *ntdb);
+
+/**
+ * ntdb_check - check a NTDB for consistency
+ * @ntdb: the ntdb context returned from ntdb_open()
+ * @check: function to check each key/data pair (or NULL)
+ * @data: argument for @check, must match type.
+ *
+ * This performs a consistency check of the open database, optionally calling
+ * a check() function on each record so you can do your own data consistency
+ * checks as well.  If check() returns an error, that is returned from
+ * ntdb_check().
+ *
+ * Note that if the NTDB uses a feature which we don't understand which
+ * indicates we can't run ntdb_check(), this will log a warning to that
+ * effect and return NTDB_SUCCESS.  You can detect this condition by
+ * looking for NTDB_CANT_CHECK in ntdb_get_flags().
+ *
+ * Returns NTDB_SUCCESS or an error.
+ */
+#define ntdb_check(ntdb, check, data)					\
+	ntdb_check_((ntdb), typesafe_cb_preargs(enum NTDB_ERROR, void *, \
+						(check), (data),	\
+						NTDB_DATA,		\
+						NTDB_DATA),		\
+		    (data))
+
+enum NTDB_ERROR ntdb_check_(struct ntdb_context *ntdb,
+			    enum NTDB_ERROR (*check)(NTDB_DATA k,
+						     NTDB_DATA d,
+						     void *data),
+			    void *data);
+
+/**
+ * ntdb_error - get the last error (not threadsafe)
+ * @ntdb: the ntdb context returned from ntdb_open()
+ *
+ * Returns the last error returned by a NTDB function.
+ *
+ * This makes porting from TDB easier, but note that the last error is not
+ * reliable in threaded programs.
+ */
+enum NTDB_ERROR ntdb_error(struct ntdb_context *ntdb);
+
+/**
+ * enum ntdb_summary_flags - flags for ntdb_summary.
+ */
+enum ntdb_summary_flags {
+	NTDB_SUMMARY_HISTOGRAMS = 1 /* Draw graphs in the summary. */
+};
+
+/**
+ * ntdb_summary - return a string describing the NTDB state
+ * @ntdb: the ntdb context returned from ntdb_open()
+ * @flags: flags to control the summary output.
+ * @summary: pointer to string to allocate.
+ *
+ * This returns a developer-readable string describing the overall
+ * state of the ntdb, such as the percentage used and sizes of records.
+ * It is designed to provide information about the ntdb at a glance
+ * without displaying any keys or data in the database.
+ *
+ * On success, sets @summary to point to a malloc()'ed nul-terminated
+ * multi-line string.  It is your responsibility to free() it.
+ */
+enum NTDB_ERROR ntdb_summary(struct ntdb_context *ntdb,
+			     enum ntdb_summary_flags flags,
+			     char **summary);
+
+
+/**
+ * ntdb_get_flags - return the flags for a ntdb
+ * @ntdb: the ntdb context returned from ntdb_open()
+ *
+ * This returns the flags on the current ntdb.  Some of these are caused by
+ * the flags argument to ntdb_open(), others (such as NTDB_CONVERT) are
+ * intuited.
+ */
+unsigned int ntdb_get_flags(struct ntdb_context *ntdb);
+
+/**
+ * ntdb_add_flag - set a flag for a ntdb
+ * @ntdb: the ntdb context returned from ntdb_open()
+ * @flag: one of NTDB_NOLOCK, NTDB_NOMMAP, NTDB_NOSYNC or NTDB_ALLOW_NESTING.
+ *
+ * You can use this to set a flag on the NTDB.  You cannot set these flags
+ * on a NTDB_INTERNAL ntdb.
+ */
+void ntdb_add_flag(struct ntdb_context *ntdb, unsigned flag);
+
+/**
+ * ntdb_remove_flag - unset a flag for a ntdb
+ * @ntdb: the ntdb context returned from ntdb_open()
+ * @flag: one of NTDB_NOLOCK, NTDB_NOMMAP, NTDB_NOSYNC or NTDB_ALLOW_NESTING.
+ *
+ * You can use this to clear a flag on the NTDB.  You cannot clear flags
+ * on a NTDB_INTERNAL ntdb.
+ */
+void ntdb_remove_flag(struct ntdb_context *ntdb, unsigned flag);
+
+/**
+ * enum ntdb_attribute_type - discriminator for union ntdb_attribute.
+ */
+enum ntdb_attribute_type {
+	NTDB_ATTRIBUTE_LOG = 0,
+	NTDB_ATTRIBUTE_HASH = 1,
+	NTDB_ATTRIBUTE_SEED = 2,
+	NTDB_ATTRIBUTE_STATS = 3,
+	NTDB_ATTRIBUTE_OPENHOOK = 4,
+	NTDB_ATTRIBUTE_FLOCK = 5,
+};
+
+/**
+ * ntdb_get_attribute - get an attribute for an existing ntdb
+ * @ntdb: the ntdb context returned from ntdb_open()
+ * @attr: the union ntdb_attribute to fill in.
+ *
+ * This gets an attribute from a NTDB which has previously been set (or
+ * may return the default values).  Set @attr.base.attr to the
+ * attribute type you want to get.
+ */
+enum NTDB_ERROR ntdb_get_attribute(struct ntdb_context *ntdb,
+				   union ntdb_attribute *attr);
+
+/**
+ * ntdb_set_attribute - set an attribute for an existing ntdb
+ * @ntdb: the ntdb context returned from ntdb_open()
+ * @attr: the union ntdb_attribute to set.
+ *
+ * This sets an attribute on a NTDB, overriding any previous attribute
+ * of the same type.  It returns NTDB_ERR_EINVAL if the attribute is
+ * unknown or invalid.
+ *
+ * Note that NTDB_ATTRIBUTE_HASH, NTDB_ATTRIBUTE_SEED, and
+ * NTDB_ATTRIBUTE_OPENHOOK cannot currently be set after ntdb_open.
+ */
+enum NTDB_ERROR ntdb_set_attribute(struct ntdb_context *ntdb,
+				   const union ntdb_attribute *attr);
+
+/**
+ * ntdb_unset_attribute - reset an attribute for an existing ntdb
+ * @ntdb: the ntdb context returned from ntdb_open()
+ * @type: the attribute type to unset.
+ *
+ * This unsets an attribute on a NTDB, returning it to the defaults
+ * (where applicable).
+ *
+ * Note that it only makes sense for NTDB_ATTRIBUTE_LOG and NTDB_ATTRIBUTE_FLOCK
+ * to be unset.
+ */
+void ntdb_unset_attribute(struct ntdb_context *ntdb,
+			  enum ntdb_attribute_type type);
+
+/**
+ * ntdb_name - get the name of a ntdb
+ * @ntdb: the ntdb context returned from ntdb_open()
+ *
+ * This returns a copy of the name string, made at ntdb_open() time.  If that
+ * argument was NULL (possible for a NTDB_INTERNAL db) this will return NULL.
+ *
+ * This is mostly useful for logging.
+ */
+const char *ntdb_name(const struct ntdb_context *ntdb);
+
+/**
+ * ntdb_fd - get the file descriptor of a ntdb
+ * @ntdb: the ntdb context returned from ntdb_open()
+ *
+ * This returns the file descriptor for the underlying database file, or -1
+ * for NTDB_INTERNAL.
+ */
+int ntdb_fd(const struct ntdb_context *ntdb);
+
+/**
+ * ntdb_foreach - iterate through every open NTDB.
+ * @fn: the function to call for every NTDB
+ * @p: the pointer to hand to @fn
+ *
+ * NTDB internally keeps track of all open TDBs; this function allows you to
+ * iterate through them.  If @fn returns non-zero, traversal stops.
+ */
+#define ntdb_foreach(fn, p)						\
+	ntdb_foreach_(typesafe_cb_preargs(int, void *, (fn), (p),	\
+					  struct ntdb_context *), (p))
+
+void ntdb_foreach_(int (*fn)(struct ntdb_context *, void *), void *p);
+
+/**
+ * struct ntdb_attribute_base - common fields for all ntdb attributes.
+ */
+struct ntdb_attribute_base {
+	enum ntdb_attribute_type attr;
+	union ntdb_attribute *next;
+};
+
+/**
+ * enum ntdb_log_level - log levels for ntdb_attribute_log
+ * @NTDB_LOG_ERROR: used to log unrecoverable errors such as I/O errors
+ *		   or internal consistency failures.
+ * @NTDB_LOG_USE_ERROR: used to log usage errors such as invalid parameters
+ *		   or writing to a read-only database.
+ * @NTDB_LOG_WARNING: used for informational messages on issues which
+ *		     are unusual but handled by NTDB internally, such
+ *		     as a failure to mmap or failure to open /dev/urandom.
+ */
+enum ntdb_log_level {
+	NTDB_LOG_ERROR,
+	NTDB_LOG_USE_ERROR,
+	NTDB_LOG_WARNING
+};
+
+/**
+ * struct ntdb_attribute_log - log function attribute
+ *
+ * This attribute provides a hook for you to log errors.
+ */
+struct ntdb_attribute_log {
+	struct ntdb_attribute_base base; /* .attr = NTDB_ATTRIBUTE_LOG */
+	void (*fn)(struct ntdb_context *ntdb,
+		   enum ntdb_log_level level,
+		   enum NTDB_ERROR ecode,
+		   const char *message,
+		   void *data);
+	void *data;
+};
+
+/**
+ * struct ntdb_attribute_hash - hash function attribute
+ *
+ * This attribute allows you to provide an alternative hash function.
+ * This hash function will be handed keys from the database; it will also
+ * be handed the 8-byte NTDB_HASH_MAGIC value for checking the header (the
+ * ntdb_open() will fail if the hash value doesn't match the header).
+ *
+ * Note that if your hash function gives different results on
+ * different machine endians, your ntdb will no longer work across
+ * different architectures!
+ */
+struct ntdb_attribute_hash {
+	struct ntdb_attribute_base base; /* .attr = NTDB_ATTRIBUTE_HASH */
+	uint64_t (*fn)(const void *key, size_t len, uint64_t seed,
+		       void *data);
+	void *data;
+};
+
+/**
+ * struct ntdb_attribute_seed - hash function seed attribute
+ *
+ * The hash function seed is normally taken from /dev/urandom (or equivalent)
+ * but can be set manually here.  This is mainly for testing purposes.
+ */
+struct ntdb_attribute_seed {
+	struct ntdb_attribute_base base; /* .attr = NTDB_ATTRIBUTE_SEED */
+	uint64_t seed;
+};
+
+/**
+ * struct ntdb_attribute_stats - ntdb operational statistics
+ *
+ * This attribute records statistics of various low-level NTDB operations.
+ * This can be used to assist performance evaluation.  This is only
+ * useful for ntdb_get_attribute().
+ *
+ * New fields will be added at the end, hence the "size" argument which
+ * indicates how large your structure is: it must be filled in before
+ * calling ntdb_get_attribute(), which will overwrite it with the size
+ * ntdb knows about.
+ */
+struct ntdb_attribute_stats {
+	struct ntdb_attribute_base base; /* .attr = NTDB_ATTRIBUTE_STATS */
+	size_t size; /* = sizeof(struct ntdb_attribute_stats) */
+	uint64_t allocs;
+	uint64_t   alloc_subhash;
+	uint64_t   alloc_chain;
+	uint64_t   alloc_bucket_exact;
+	uint64_t   alloc_bucket_max;
+	uint64_t   alloc_leftover;
+	uint64_t   alloc_coalesce_tried;
+	uint64_t     alloc_coalesce_iterate_clash;
+	uint64_t     alloc_coalesce_lockfail;
+	uint64_t     alloc_coalesce_race;
+	uint64_t     alloc_coalesce_succeeded;
+	uint64_t       alloc_coalesce_num_merged;
+	uint64_t compares;
+	uint64_t   compare_wrong_bucket;
+	uint64_t   compare_wrong_offsetbits;
+	uint64_t   compare_wrong_keylen;
+	uint64_t   compare_wrong_rechash;
+	uint64_t   compare_wrong_keycmp;
+	uint64_t transactions;
+	uint64_t   transaction_cancel;
+	uint64_t   transaction_nest;
+	uint64_t   transaction_expand_file;
+	uint64_t   transaction_read_direct;
+	uint64_t      transaction_read_direct_fail;
+	uint64_t   transaction_write_direct;
+	uint64_t      transaction_write_direct_fail;
+	uint64_t expands;
+	uint64_t frees;
+	uint64_t locks;
+	uint64_t   lock_lowlevel;
+	uint64_t   lock_nonblock;
+	uint64_t     lock_nonblock_fail;
+};
+
+/**
+ * struct ntdb_attribute_openhook - ntdb special effects hook for open
+ *
+ * This attribute contains a function to call once we have the OPEN_LOCK
+ * for the ntdb, but before we've examined its contents.  If this succeeds,
+ * the ntdb will be populated if it's then zero-length.
+ *
+ * This is a hack to allow support for TDB-style TDB_CLEAR_IF_FIRST
+ * behaviour.
+ */
+struct ntdb_attribute_openhook {
+	struct ntdb_attribute_base base; /* .attr = NTDB_ATTRIBUTE_OPENHOOK */
+	enum NTDB_ERROR (*fn)(int fd, void *data);
+	void *data;
+};
+
+/**
+ * struct ntdb_attribute_flock - ntdb special effects hook for file locking
+ *
+ * This attribute contains functions to call to place locks on a file; it can
+ * be used to support non-blocking operations or lock proxying.
+ *
+ * They should return 0 on success, -1 on failure and set errno.
+ *
+ * An error will be logged on failure if errno is neither EAGAIN nor EINTR
+ * (normally it would only return EAGAIN if waitflag is false, and
+ * loop internally on EINTR).
+ */
+struct ntdb_attribute_flock {
+	struct ntdb_attribute_base base; /* .attr = NTDB_ATTRIBUTE_FLOCK */
+	int (*lock)(int fd,int rw, off_t off, off_t len, bool waitflag, void *);
+	int (*unlock)(int fd, int rw, off_t off, off_t len, void *);
+	void *data;
+};
+
+/**
+ * union ntdb_attribute - ntdb attributes.
+ *
+ * This represents all the known attributes.
+ *
+ * See also:
+ *	struct ntdb_attribute_log, struct ntdb_attribute_hash,
+ *	struct ntdb_attribute_seed, struct ntdb_attribute_stats,
+ *	struct ntdb_attribute_openhook, struct ntdb_attribute_flock.
+ */
+union ntdb_attribute {
+	struct ntdb_attribute_base base;
+	struct ntdb_attribute_log log;
+	struct ntdb_attribute_hash hash;
+	struct ntdb_attribute_seed seed;
+	struct ntdb_attribute_stats stats;
+	struct ntdb_attribute_openhook openhook;
+	struct ntdb_attribute_flock flock;
+};
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif /* ntdb.h */
diff --git a/lib/ntdb/ntdb.pc.in b/lib/ntdb/ntdb.pc.in
new file mode 100644
index 0000000000..36a7d5136c
--- /dev/null
+++ b/lib/ntdb/ntdb.pc.in
@@ -0,0 +1,11 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: ntdb
+Description: A (not-so) trivial database
+Version: @PACKAGE_VERSION@
+Libs: @LIB_RPATH@ -L${libdir} -lntdb
+Cflags: -I${includedir}
+URL: http://tdb.samba.org/
diff --git a/lib/ntdb/open.c b/lib/ntdb/open.c
new file mode 100644
index 0000000000..338de8be8c
--- /dev/null
+++ b/lib/ntdb/open.c
@@ -0,0 +1,768 @@
+ /*
+   Trivial Database 2: opening and closing TDBs
+   Copyright (C) Rusty Russell 2010
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <ccan/build_assert/build_assert.h>
+#include <assert.h>
+
+/* all tdbs, to detect double-opens (fcntl file locks don't nest!) */
+static struct ntdb_context *tdbs = NULL;
+
+static struct ntdb_file *find_file(dev_t device, ino_t ino)
+{
+	struct ntdb_context *ctx = tdbs;
+	/* Look through every listed context for the same device/inode pair. */
+	for (; ctx != NULL; ctx = ctx->next) {
+		if (ctx->file->device != device || ctx->file->inode != ino)
+			continue;
+		ctx->file->refcnt++;	/* the caller becomes another user */
+		return ctx->file;
+	}
+	return NULL;
+}
+
+static bool read_all(int fd, void *buf, size_t len)
+{
+	char *dst = buf;	/* cursor into the caller's buffer */
+	while (len != 0) {
+		ssize_t got = read(fd, dst, len);
+		if (got < 0)
+			return false;	/* errno set by read(2) */
+		if (got == 0) {
+			/* Premature EOF: no ETOOSHORT exists to report. */
+			errno = EWOULDBLOCK;
+			return false;
+		}
+		dst += got;
+		len -= got;
+	}
+	return true;
+}
+
+static uint64_t random_number(struct ntdb_context *ntdb)
+{
+	uint64_t value = 0;
+	struct timeval tv;
+	int fd;
+
+	/* First choice: read the whole value from /dev/urandom. */
+	fd = open("/dev/urandom", O_RDONLY);
+	if (fd >= 0) {
+		bool filled = read_all(fd, &value, sizeof(value));
+		close(fd);
+		if (filled)
+			return value;
+	}
+	/* FIXME: Untested!  Based on Wikipedia protocol description! */
+	fd = open("/dev/egd-pool", O_RDWR);
+	if (fd >= 0) {
+		/* Command is 1, next byte is size we want to read. */
+		char request[2] = { 1, sizeof(uint64_t) };
+		char reply[1 + sizeof(uint64_t)];
+		bool complete = false;
+		if (write(fd, request, sizeof(request)) == sizeof(request)) {
+			int got = read(fd, reply, sizeof(reply));
+			if (got > 1) {
+				/* Keep whatever arrived, even if short. */
+				memcpy(&value, reply + 1, got - 1);
+				complete = (reply[0] == sizeof(uint64_t)
+					    && got == sizeof(reply));
+			}
+		}
+		close(fd);
+		if (complete)
+			return value;
+	}
+
+	/* Fallback: pid and time. */
+	gettimeofday(&tv, NULL);
+	value = getpid() * 100132289ULL + tv.tv_sec * 1000000ULL + tv.tv_usec;
+	ntdb_logerr(ntdb, NTDB_SUCCESS, NTDB_LOG_WARNING,
+		   "ntdb_open: random from getpid and time");
+	return value;
+}
+
+static void ntdb_context_init(struct ntdb_context *ntdb)
+{
+	/* Run ntdb_io_init(), then clear the remaining per-context state. */
+	ntdb_io_init(ntdb);
+	ntdb->access = NULL;
+	ntdb->transaction = NULL;
+	ntdb->direct_access = 0;
+}
+
+struct new_database {
+	struct ntdb_header hdr;		/* first member: starts the image */
+	struct ntdb_freetable ftable;	/* hdr.free_table is set to its offset */
+};
+
+/* initialise a new database; its (converted) header is copied to *hdr */
+static enum NTDB_ERROR ntdb_new_database(struct ntdb_context *ntdb,
+				       struct ntdb_attribute_seed *seed,
+				       struct ntdb_header *hdr)
+{
+	/* We make it up in memory, then write it out if not internal */
+	struct new_database newdb;
+	unsigned int magic_len;
+	ssize_t rlen;
+	enum NTDB_ERROR ecode;
+
+	/* Fill in the header; a caller-supplied seed beats a random one */
+	newdb.hdr.version = NTDB_VERSION;
+	if (seed)
+		newdb.hdr.hash_seed = seed->seed;
+	else
+		newdb.hdr.hash_seed = random_number(ntdb);
+	newdb.hdr.hash_test = NTDB_HASH_MAGIC; /* hashed in place just below */
+	newdb.hdr.hash_test = ntdb->hash_fn(&newdb.hdr.hash_test,
+					   sizeof(newdb.hdr.hash_test),
+					   newdb.hdr.hash_seed,
+					   ntdb->hash_data);
+	newdb.hdr.recovery = 0;
+	newdb.hdr.features_used = newdb.hdr.features_offered = NTDB_FEATURE_MASK;
+	newdb.hdr.seqnum = 0;
+	newdb.hdr.capabilities = 0;
+	memset(newdb.hdr.reserved, 0, sizeof(newdb.hdr.reserved));
+	/* Initial hashes are empty. */
+	memset(newdb.hdr.hashtable, 0, sizeof(newdb.hdr.hashtable));
+
+	/* Free is empty: the free table directly follows the header. */
+	newdb.hdr.free_table = offsetof(struct new_database, ftable);
+	memset(&newdb.ftable, 0, sizeof(newdb.ftable));
+	ecode = set_header(NULL, &newdb.ftable.hdr, NTDB_FTABLE_MAGIC, 0,
+			   sizeof(newdb.ftable) - sizeof(newdb.ftable.hdr),
+			   sizeof(newdb.ftable) - sizeof(newdb.ftable.hdr),
+			   0);
+	if (ecode != NTDB_SUCCESS) {
+		return ecode;
+	}
+
+	/* Magic food: zero-fill first so the string is NUL padded */
+	memset(newdb.hdr.magic_food, 0, sizeof(newdb.hdr.magic_food));
+	strcpy(newdb.hdr.magic_food, NTDB_MAGIC_FOOD);
+
+	/* This creates an endian-converted database, as if read from disk */
+	magic_len = sizeof(newdb.hdr.magic_food); /* text is never converted */
+	ntdb_convert(ntdb,
+		    (char *)&newdb.hdr + magic_len, sizeof(newdb) - magic_len);
+
+	*hdr = newdb.hdr; /* caller receives the converted form */
+
+	if (ntdb->flags & NTDB_INTERNAL) {
+		ntdb->file->map_size = sizeof(newdb);
+		ntdb->file->map_ptr = malloc(ntdb->file->map_size);
+		if (!ntdb->file->map_ptr) {
+			return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
+					  "ntdb_new_database:"
+					  " failed to allocate");
+		}
+		memcpy(ntdb->file->map_ptr, &newdb, ntdb->file->map_size);
+		return NTDB_SUCCESS;
+	}
+	if (lseek(ntdb->file->fd, 0, SEEK_SET) == -1) {
+		return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+				  "ntdb_new_database:"
+				  " failed to seek: %s", strerror(errno));
+	}
+
+	if (ftruncate(ntdb->file->fd, 0) == -1) {
+		return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+				  "ntdb_new_database:"
+				  " failed to truncate: %s", strerror(errno));
+	}
+
+	rlen = write(ntdb->file->fd, &newdb, sizeof(newdb));
+	if (rlen != sizeof(newdb)) {
+		if (rlen >= 0) /* short write: supply an errno for the message */
+			errno = ENOSPC;
+		return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+				  "ntdb_new_database: %zi writing header: %s",
+				  rlen, strerror(errno));
+	}
+	return NTDB_SUCCESS;
+}
+
+static enum NTDB_ERROR ntdb_new_file(struct ntdb_context *ntdb)
+{
+	struct ntdb_file *file = ntdb->file = malloc(sizeof(*file));
+	if (file == NULL)
+		return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
+				  "ntdb_open: cannot alloc ntdb_file structure");
+	file->refcnt = 1;	/* this context is the only user so far */
+	file->map_ptr = NULL;
+	file->lockrecs = NULL;
+	file->num_lockrecs = 0;
+	file->allrecord_lock.count = 0;
+	return NTDB_SUCCESS;
+}
+
+_PUBLIC_ enum NTDB_ERROR ntdb_set_attribute(struct ntdb_context *ntdb,
+				 const union ntdb_attribute *attr)
+{
+	const char *fixed;	/* name of an attribute that cannot change */
+
+	switch (attr->base.attr) {
+	case NTDB_ATTRIBUTE_LOG:
+		ntdb->log_fn = attr->log.fn;
+		ntdb->log_data = attr->log.data;
+		return NTDB_SUCCESS;
+	case NTDB_ATTRIBUTE_FLOCK:
+		ntdb->lock_fn = attr->flock.lock;
+		ntdb->unlock_fn = attr->flock.unlock;
+		ntdb->lock_data = attr->flock.data;
+		return NTDB_SUCCESS;
+	case NTDB_ATTRIBUTE_HASH:
+		fixed = "NTDB_ATTRIBUTE_HASH";
+		break;
+	case NTDB_ATTRIBUTE_SEED:
+		fixed = "NTDB_ATTRIBUTE_SEED";
+		break;
+	case NTDB_ATTRIBUTE_OPENHOOK:
+		fixed = "NTDB_ATTRIBUTE_OPENHOOK";
+		break;
+	case NTDB_ATTRIBUTE_STATS:
+		return ntdb->last_error
+			= ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
+				     "ntdb_set_attribute:"
+				     " cannot set NTDB_ATTRIBUTE_STATS");
+	default:
+		return ntdb->last_error
+			= ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
+				     "ntdb_set_attribute:"
+				     " unknown attribute type %u",
+				     attr->base.attr);
+	}
+	/* Only the three open-time attributes break out of the switch. */
+	ntdb->last_error = ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
+				      "ntdb_set_attribute:"
+				      " cannot set %s after opening", fixed);
+	return ntdb->last_error;
+}
+
+_PUBLIC_ enum NTDB_ERROR ntdb_get_attribute(struct ntdb_context *ntdb,
+				 union ntdb_attribute *attr)
+{
+	size_t len;	/* bytes of statistics to hand back */
+	switch (attr->base.attr) {
+	case NTDB_ATTRIBUTE_HASH:
+		attr->hash.data = ntdb->hash_data;
+		attr->hash.fn = ntdb->hash_fn;
+		break;
+	case NTDB_ATTRIBUTE_SEED:
+		attr->seed.seed = ntdb->hash_seed;
+		break;
+	case NTDB_ATTRIBUTE_FLOCK:
+		attr->flock.data = ntdb->lock_data;
+		attr->flock.unlock = ntdb->unlock_fn;
+		attr->flock.lock = ntdb->lock_fn;
+		break;
+	case NTDB_ATTRIBUTE_LOG:
+		if (ntdb->log_fn == NULL)
+			return ntdb->last_error = NTDB_ERR_NOEXIST;
+		attr->log.data = ntdb->log_data;
+		attr->log.fn = ntdb->log_fn;
+		break;
+	case NTDB_ATTRIBUTE_OPENHOOK:
+		if (ntdb->openhook == NULL)
+			return ntdb->last_error = NTDB_ERR_NOEXIST;
+		attr->openhook.data = ntdb->openhook_data;
+		attr->openhook.fn = ntdb->openhook;
+		break;
+	case NTDB_ATTRIBUTE_STATS:
+		/* Copy no more than the smaller of the two size fields. */
+		len = attr->stats.size;
+		if (len > ntdb->stats.size)
+			len = ntdb->stats.size;
+		memcpy(&attr->stats, &ntdb->stats, len);
+		break;
+	default:
+		return ntdb->last_error
+			= ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
+				     "ntdb_get_attribute:"
+				     " unknown attribute type %u",
+				     attr->base.attr);
+	}
+	attr->base.next = NULL;
+	return NTDB_SUCCESS;
+}
+
+_PUBLIC_ void ntdb_unset_attribute(struct ntdb_context *ntdb,
+			 enum ntdb_attribute_type type)
+{
+	switch (type) {
+	case NTDB_ATTRIBUTE_LOG:
+		ntdb->log_fn = NULL;
+		break;
+	case NTDB_ATTRIBUTE_OPENHOOK:
+		ntdb->openhook = NULL;
+		break;
+	case NTDB_ATTRIBUTE_HASH:
+	case NTDB_ATTRIBUTE_SEED:
+		ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
+			   "ntdb_unset_attribute: cannot unset %s after opening",
+			   type == NTDB_ATTRIBUTE_HASH
+			   ? "NTDB_ATTRIBUTE_HASH"
+			   : "NTDB_ATTRIBUTE_SEED");
+		break;
+	case NTDB_ATTRIBUTE_STATS:
+		ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
+			   NTDB_LOG_USE_ERROR,
+			   "ntdb_unset_attribute:"
+			   "cannot unset NTDB_ATTRIBUTE_STATS");
+		break;
+	case NTDB_ATTRIBUTE_FLOCK:
+		ntdb->lock_fn = ntdb_fcntl_lock;
+		ntdb->unlock_fn = ntdb_fcntl_unlock;
+		break;
+	default:
+		ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
+			   NTDB_LOG_USE_ERROR,
+			   "ntdb_unset_attribute: unknown attribute type %u",
+			   type);
+	}
+}
+
+/* The top three bits of the capability tell us whether it matters. */
+enum NTDB_ERROR unknown_capability(struct ntdb_context *ntdb, const char *caller,
+				  ntdb_off_t type)
+{
+	unsigned long long cap = type & NTDB_CAP_TYPE_MASK; /* sans flag bits */
+	if (type & NTDB_CAP_NOOPEN) {
+		return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+				  "%s: file has unknown capability %llu",
+				  caller, cap);
+	}
+
+	if ((type & NTDB_CAP_NOWRITE) && !(ntdb->flags & NTDB_RDONLY)) {
+		return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_ERROR,
+				  "%s: file has unknown capability %llu"
+				  " (cannot write to it)",
+				  caller, cap);
+	}
+
+	if (type & NTDB_CAP_NOCHECK)	/* openable, but ntdb_check can't cope */
+		ntdb->flags |= NTDB_CANT_CHECK;
+	return NTDB_SUCCESS;
+}
+
+static enum NTDB_ERROR capabilities_ok(struct ntdb_context *ntdb,
+				      ntdb_off_t capabilities)
+{
+	ntdb_off_t off = capabilities;
+	enum NTDB_ERROR ecode = NTDB_SUCCESS;
+
+	/* Walk the list until it ends (offset 0) or a capability is refused. */
+	while (off != 0 && ecode == NTDB_SUCCESS) {
+		const struct ntdb_capability *cap;
+		cap = ntdb_access_read(ntdb, off, sizeof(*cap), true);
+		if (NTDB_PTR_IS_ERR(cap)) {
+			return NTDB_PTR_ERR(cap);
+		}
+
+		switch (cap->type & NTDB_CAP_TYPE_MASK) {
+		/* We don't understand any capabilities (yet). */
+		default:
+			ecode = unknown_capability(ntdb, "ntdb_open", cap->type);
+		}
+		off = cap->next;	/* fetch before the record is released */
+		ntdb_access_release(ntdb, cap);
+	}
+	return ecode;
+}
+
+/* Open the database "name" (creating it if open_flags allows), applying the
+ * attribute list.  Returns the new context, or NULL with errno set on failure.
+ * NTDB_INTERNAL databases live only in malloc'ed memory and have fd == -1. */
+_PUBLIC_ struct ntdb_context *ntdb_open(const char *name, int ntdb_flags,
+			     int open_flags, mode_t mode,
+			     union ntdb_attribute *attr)
+{
+	struct ntdb_context *ntdb;
+	struct stat st;
+	int openlock, saved_errno = 0;
+	uint64_t hash_test;
+	unsigned v;
+	ssize_t rlen;
+	struct ntdb_header hdr;
+	struct ntdb_attribute_seed *seed = NULL;
+	ntdb_bool_err berr;
+	enum NTDB_ERROR ecode;
+
+	ntdb = malloc(sizeof(*ntdb) + (name ? strlen(name) + 1 : 0));
+	if (!ntdb) {
+		/* Can't log this */
+		errno = ENOMEM;
+		return NULL;
+	}
+	/* Set name immediately for logging functions. */
+	if (name) {
+		ntdb->name = strcpy((char *)(ntdb + 1), name);
+	} else {
+		ntdb->name = NULL;
+	}
+	ntdb->flags = ntdb_flags;
+	ntdb->log_fn = NULL;
+	ntdb->open_flags = open_flags;
+	ntdb->last_error = NTDB_SUCCESS;
+	ntdb->file = NULL;
+	ntdb->openhook = NULL;
+	ntdb->lock_fn = ntdb_fcntl_lock;
+	ntdb->unlock_fn = ntdb_fcntl_unlock;
+	ntdb->hash_fn = ntdb_jenkins_hash;
+	memset(&ntdb->stats, 0, sizeof(ntdb->stats));
+	ntdb->stats.base.attr = NTDB_ATTRIBUTE_STATS;
+	ntdb->stats.size = sizeof(ntdb->stats);
+
+	while (attr) {
+		switch (attr->base.attr) {
+		case NTDB_ATTRIBUTE_HASH:
+			ntdb->hash_fn = attr->hash.fn;
+			ntdb->hash_data = attr->hash.data;
+			break;
+		case NTDB_ATTRIBUTE_SEED:
+			seed = &attr->seed;
+			break;
+		case NTDB_ATTRIBUTE_OPENHOOK:
+			ntdb->openhook = attr->openhook.fn;
+			ntdb->openhook_data = attr->openhook.data;
+			break;
+		default:
+			/* These are set as normal. */
+			ecode = ntdb_set_attribute(ntdb, attr);
+			if (ecode != NTDB_SUCCESS)
+				goto fail;
+		}
+		attr = attr->base.next;
+	}
+
+	if (ntdb_flags & ~(NTDB_INTERNAL | NTDB_NOLOCK | NTDB_NOMMAP | NTDB_CONVERT
+			  | NTDB_NOSYNC | NTDB_SEQNUM | NTDB_ALLOW_NESTING
+			  | NTDB_RDONLY)) {
+		ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
+				   "ntdb_open: unknown flags %u", ntdb_flags);
+		goto fail;
+	}
+
+	if (seed && !(ntdb_flags & NTDB_INTERNAL) && !(open_flags & O_CREAT)) {
+		ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
+				   NTDB_LOG_USE_ERROR,
+				   "ntdb_open:"
+				   " cannot set NTDB_ATTRIBUTE_SEED"
+				   " without O_CREAT.");
+		goto fail;
+	}
+
+	if ((open_flags & O_ACCMODE) == O_WRONLY) {
+		ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
+				   "ntdb_open: can't open ntdb %s write-only",
+				   name);
+		goto fail;
+	}
+
+	if ((open_flags & O_ACCMODE) == O_RDONLY) {
+		openlock = F_RDLCK;
+		ntdb->flags |= NTDB_RDONLY;
+	} else {
+		if (ntdb_flags & NTDB_RDONLY) {
+			ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
+					   NTDB_LOG_USE_ERROR,
+					   "ntdb_open: can't use NTDB_RDONLY"
+					   " without O_RDONLY");
+			goto fail;
+		}
+		openlock = F_WRLCK;
+	}
+
+	/* internal databases don't need any of the rest. */
+	if (ntdb->flags & NTDB_INTERNAL) {
+		ntdb->flags |= (NTDB_NOLOCK | NTDB_NOMMAP);
+		ecode = ntdb_new_file(ntdb);
+		if (ecode != NTDB_SUCCESS) {
+			goto fail;
+		}
+		ntdb->file->fd = -1;
+		ecode = ntdb_new_database(ntdb, seed, &hdr);
+		if (ecode == NTDB_SUCCESS) {
+			ntdb_convert(ntdb, &hdr.hash_seed,
+				    sizeof(hdr.hash_seed));
+			ntdb->hash_seed = hdr.hash_seed;
+			ntdb_context_init(ntdb);
+			ecode = ntdb_ftable_init(ntdb); /* don't drop errors */
+		}
+		if (ecode != NTDB_SUCCESS) {
+			goto fail;
+		}
+		return ntdb;
+	}
+
+	if (stat(name, &st) != -1)
+		ntdb->file = find_file(st.st_dev, st.st_ino);
+
+	if (!ntdb->file) {
+		int fd;
+
+		if ((fd = open(name, open_flags, mode)) == -1) {
+			/* errno set by open(2) */
+			saved_errno = errno;
+			ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+				   "ntdb_open: could not open file %s: %s",
+				   name, strerror(errno));
+			goto fail_errno;
+		}
+
+		/* on exec, don't inherit the fd */
+		v = fcntl(fd, F_GETFD, 0);
+		fcntl(fd, F_SETFD, v | FD_CLOEXEC);
+
+		if (fstat(fd, &st) == -1) {
+			saved_errno = errno;
+			ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+				   "ntdb_open: could not stat open %s: %s",
+				   name, strerror(errno));
+			close(fd);
+			goto fail_errno;
+		}
+
+		ecode = ntdb_new_file(ntdb);
+		if (ecode != NTDB_SUCCESS) {
+			close(fd);
+			goto fail;
+		}
+
+		ntdb->file->fd = fd;
+		ntdb->file->device = st.st_dev;
+		ntdb->file->inode = st.st_ino;
+		ntdb->file->map_ptr = NULL;
+		ntdb->file->map_size = 0;
+	}
+
+	/* ensure there is only one process initialising at once */
+	ecode = ntdb_lock_open(ntdb, openlock, NTDB_LOCK_WAIT|NTDB_LOCK_NOCHECK);
+	if (ecode != NTDB_SUCCESS) {
+		saved_errno = errno;
+		goto fail_errno;
+	}
+
+	/* call their open hook if they gave us one. */
+	if (ntdb->openhook) {
+		ecode = ntdb->openhook(ntdb->file->fd, ntdb->openhook_data);
+		if (ecode != NTDB_SUCCESS) {
+			ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
+				   "ntdb_open: open hook failed");
+			goto fail;
+		}
+		open_flags |= O_CREAT;
+	}
+
+	/* If they used O_TRUNC, read will return 0. */
+	rlen = pread(ntdb->file->fd, &hdr, sizeof(hdr), 0);
+	if (rlen == 0 && (open_flags & O_CREAT)) {
+		ecode = ntdb_new_database(ntdb, seed, &hdr);
+		if (ecode != NTDB_SUCCESS) {
+			goto fail;
+		}
+	} else if (rlen < 0) {
+		ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+				   "ntdb_open: error %s reading %s",
+				   strerror(errno), name);
+		goto fail;
+	} else if ((size_t)rlen < sizeof(hdr)
+		   || strcmp(hdr.magic_food, NTDB_MAGIC_FOOD) != 0) {
+		ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+				   "ntdb_open: %s is not a ntdb file", name);
+		goto fail;
+	}
+
+	if (hdr.version != NTDB_VERSION) {
+		if (hdr.version == bswap_64(NTDB_VERSION))
+			ntdb->flags |= NTDB_CONVERT;
+		else {
+			/* wrong version */
+			ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+					   "ntdb_open:"
+					   " %s is unknown version 0x%llx",
+					   name, (long long)hdr.version);
+			goto fail;
+		}
+	} else if (ntdb->flags & NTDB_CONVERT) {
+		ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+				   "ntdb_open:"
+				   " %s does not need NTDB_CONVERT",
+				   name);
+		goto fail;
+	}
+
+	ntdb_context_init(ntdb);
+
+	ntdb_convert(ntdb, &hdr, sizeof(hdr));
+	ntdb->hash_seed = hdr.hash_seed;
+	hash_test = NTDB_HASH_MAGIC;
+	hash_test = ntdb_hash(ntdb, &hash_test, sizeof(hash_test));
+	if (hdr.hash_test != hash_test) {
+		/* wrong hash variant */
+		ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+				   "ntdb_open:"
+				   " %s uses a different hash function",
+				   name);
+		goto fail;
+	}
+
+	ecode = capabilities_ok(ntdb, hdr.capabilities);
+	if (ecode != NTDB_SUCCESS) {
+		goto fail;
+	}
+
+	/* Clear any features we don't understand. */
+	if ((open_flags & O_ACCMODE) != O_RDONLY) {
+		hdr.features_used &= NTDB_FEATURE_MASK;
+		ecode = ntdb_write_convert(ntdb, offsetof(struct ntdb_header,
+							features_used),
+					  &hdr.features_used,
+					  sizeof(hdr.features_used));
+		if (ecode != NTDB_SUCCESS)
+			goto fail;
+	}
+
+	ntdb_unlock_open(ntdb, openlock);
+
+	/* This makes sure we have current map_size and mmap. */
+	ecode = ntdb->io->oob(ntdb, ntdb->file->map_size, 1, true);
+	if (unlikely(ecode != NTDB_SUCCESS))
+		goto fail;
+
+	/* Now it's fully formed, recover if necessary. */
+	berr = ntdb_needs_recovery(ntdb);
+	if (unlikely(berr != false)) {
+		if (berr < 0) {
+			ecode = NTDB_OFF_TO_ERR(berr);
+			goto fail;
+		}
+		ecode = ntdb_lock_and_recover(ntdb);
+		if (ecode != NTDB_SUCCESS) {
+			goto fail;
+		}
+	}
+
+	ecode = ntdb_ftable_init(ntdb);
+	if (ecode != NTDB_SUCCESS) {
+		goto fail;
+	}
+
+	ntdb->next = tdbs;
+	tdbs = ntdb;
+	return ntdb;
+
+ fail:
+	/* Map ecode to some logical errno. */
+	switch (NTDB_ERR_TO_OFF(ecode)) {
+	case NTDB_ERR_TO_OFF(NTDB_ERR_CORRUPT):
+	case NTDB_ERR_TO_OFF(NTDB_ERR_IO):
+		saved_errno = EIO;
+		break;
+	case NTDB_ERR_TO_OFF(NTDB_ERR_LOCK):
+		saved_errno = EWOULDBLOCK;
+		break;
+	case NTDB_ERR_TO_OFF(NTDB_ERR_OOM):
+		saved_errno = ENOMEM;
+		break;
+	case NTDB_ERR_TO_OFF(NTDB_ERR_EINVAL):
+		saved_errno = EINVAL;
+		break;
+	default:
+		saved_errno = EINVAL;
+		break;
+	}
+
+fail_errno:
+#ifdef NTDB_TRACE
+	close(ntdb->tracefd);
+#endif
+	if (ntdb->file) {
+		ntdb_lock_cleanup(ntdb);
+		if (--ntdb->file->refcnt == 0) {
+			assert(ntdb->file->num_lockrecs == 0);
+			if (ntdb->file->map_ptr) {
+				if (ntdb->flags & NTDB_INTERNAL) {
+					free(ntdb->file->map_ptr);
+				} else
+					ntdb_munmap(ntdb->file);
+			}
+			if (ntdb->file->fd != -1 && close(ntdb->file->fd) != 0)
+				ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+					   "ntdb_open: failed to close ntdb fd"
+					   " on error: %s", strerror(errno));
+			free(ntdb->file->lockrecs);
+			free(ntdb->file);
+		}
+	}
+
+	free(ntdb);
+	errno = saved_errno;
+	return NULL;
+}
+
+/* Tear down a context; returns close(2)'s result if the fd was closed, else 0. */
+_PUBLIC_ int ntdb_close(struct ntdb_context *ntdb)
+{
+	int ret = 0;
+	struct ntdb_context **i;
+
+	ntdb_trace(ntdb, "ntdb_close");
+	if (ntdb->transaction)
+		ntdb_transaction_cancel(ntdb);
+	/* Test ntdb->file before dereferencing it, not afterwards. */
+	if (ntdb->file && ntdb->file->map_ptr) {
+		if (ntdb->flags & NTDB_INTERNAL)
+			free(ntdb->file->map_ptr);
+		else
+			ntdb_munmap(ntdb->file);
+	}
+	if (ntdb->file) {
+		ntdb_lock_cleanup(ntdb);
+		if (--ntdb->file->refcnt == 0) {
+			/* NTDB_INTERNAL has fd == -1: nothing to close. */
+			if (ntdb->file->fd != -1)
+				ret = close(ntdb->file->fd);
+			free(ntdb->file->lockrecs);
+			free(ntdb->file);
+		}
+	}
+
+	/* Remove from tdbs list */
+	for (i = &tdbs; *i; i = &(*i)->next) {
+		if (*i == ntdb) {
+			*i = ntdb->next;
+			break;
+		}
+	}
+
+#ifdef NTDB_TRACE
+	close(ntdb->tracefd);
+#endif
+	free(ntdb);
+	return ret;
+}
+
+_PUBLIC_ void ntdb_foreach_(int (*fn)(struct ntdb_context *, void *), void *p)
+{
+	struct ntdb_context *ctx = tdbs;
+
+	/* Hand each listed context to fn, stopping as soon as it
+	 * returns non-zero. */
+	while (ctx != NULL && fn(ctx, p) == 0)
+		ctx = ctx->next;
+}
diff --git a/lib/ntdb/private.h b/lib/ntdb/private.h
new file mode 100644
index 0000000000..1cf9b7aca4
--- /dev/null
+++ b/lib/ntdb/private.h
@@ -0,0 +1,657 @@
+#ifndef NTDB_PRIVATE_H
+#define NTDB_PRIVATE_H
+/*
+  Trivial Database 2: private types and prototypes
+  Copyright (C) Rusty Russell 2010
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 3 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "config.h"
+#ifndef HAVE_CCAN
+#error You need ccan to build ntdb!
+#endif
+#include "ntdb.h"
+#include <ccan/compiler/compiler.h>
+#include <ccan/likely/likely.h>
+#include <ccan/endian/endian.h>
+
+#ifdef HAVE_LIBREPLACE
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/time.h"
+#include "system/shmem.h"
+#include "system/select.h"
+#include "system/wait.h"
+#else
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <sys/time.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdio.h>
+#include <utime.h>
+#include <unistd.h>
+#endif
+
+#ifndef TEST_IT
+#define TEST_IT(cond)
+#endif
+
+/* #define NTDB_TRACE 1 */
+
+#ifndef __STRING
+#define __STRING(x)    #x
+#endif
+
+#ifndef __STRINGSTRING
+#define __STRINGSTRING(x) __STRING(x)
+#endif
+
+#ifndef __location__
+#define __location__ __FILE__ ":" __STRINGSTRING(__LINE__)
+#endif
+
+typedef uint64_t ntdb_len_t;
+typedef uint64_t ntdb_off_t;
+
+#define NTDB_MAGIC_FOOD "NTDB file\n"
+#define NTDB_VERSION ((uint64_t)(0x26011967 + 7))
+#define NTDB_USED_MAGIC ((uint64_t)0x1999)
+#define NTDB_HTABLE_MAGIC ((uint64_t)0x1888)
+#define NTDB_CHAIN_MAGIC ((uint64_t)0x1777)
+#define NTDB_FTABLE_MAGIC ((uint64_t)0x1666)
+#define NTDB_CAP_MAGIC ((uint64_t)0x1555)
+#define NTDB_FREE_MAGIC ((uint64_t)0xFE)
+#define NTDB_HASH_MAGIC (0xA1ABE11A01092008ULL)
+#define NTDB_RECOVERY_MAGIC (0xf53bc0e7ad124589ULL)
+#define NTDB_RECOVERY_INVALID_MAGIC (0x0ULL)
+
+/* Capability bits. */
+#define NTDB_CAP_TYPE_MASK	0x1FFFFFFFFFFFFFFFULL
+#define NTDB_CAP_NOCHECK		0x8000000000000000ULL
+#define NTDB_CAP_NOWRITE		0x4000000000000000ULL
+#define NTDB_CAP_NOOPEN		0x2000000000000000ULL
+
+/* Packing errors into offsets and v.v. (macro arguments parenthesized). */
+#define NTDB_OFF_IS_ERR(off) unlikely((off) >= (ntdb_off_t)(long)NTDB_ERR_LAST)
+#define NTDB_OFF_TO_ERR(off) ((enum NTDB_ERROR)(long)(off))
+#define NTDB_ERR_TO_OFF(ecode) ((ntdb_off_t)(long)(ecode))
+/* Packing errors into pointers and v.v. */
+#define NTDB_PTR_IS_ERR(ptr)						\
+	unlikely((unsigned long)(ptr) >= (unsigned long)NTDB_ERR_LAST)
+#define NTDB_PTR_ERR(p) ((enum NTDB_ERROR)(long)(p))
+#define NTDB_ERR_PTR(err) ((void *)(long)(err))
+
+/* Common case of returning true, false or -ve error. */
+typedef int ntdb_bool_err;
+
+/* Prevent others from opening the file. */
+#define NTDB_OPEN_LOCK 0
+/* Expanding file. */
+#define NTDB_EXPANSION_LOCK 2
+/* Doing a transaction. */
+#define NTDB_TRANSACTION_LOCK 8
+/* Hash chain locks. */
+#define NTDB_HASH_LOCK_START 64
+
+/* Range for hash locks. */
+#define NTDB_HASH_LOCK_RANGE_BITS 30
+#define NTDB_HASH_LOCK_RANGE (1 << NTDB_HASH_LOCK_RANGE_BITS)
+
+/* We have 1024 entries in the top level. */
+#define NTDB_TOPLEVEL_HASH_BITS 10
+/* And 64 entries in each sub-level: thus 64 bits exactly after 9 levels. */
+#define NTDB_SUBLEVEL_HASH_BITS 6
+/* And 8 entries in each group, ie 8 groups per sublevel. */
+#define NTDB_HASH_GROUP_BITS 3
+/* This is currently 10: beyond this we chain. */
+#define NTDB_MAX_LEVELS (1+(64-NTDB_TOPLEVEL_HASH_BITS) / NTDB_SUBLEVEL_HASH_BITS)
+
+/* Extend the file by at least 100 times more than is needed. */
+#define NTDB_EXTENSION_FACTOR 100
+
+/* We steal bits from the offsets to store hash info. */
+#define NTDB_OFF_HASH_GROUP_MASK ((1ULL << NTDB_HASH_GROUP_BITS) - 1)
+/* We steal this many upper bits, giving a maximum offset of 64 exabytes. */
+#define NTDB_OFF_UPPER_STEAL 8
+#define   NTDB_OFF_UPPER_STEAL_EXTRA 7
+/* The bit number where we store extra hash bits. */
+#define NTDB_OFF_HASH_EXTRA_BIT 57
+#define NTDB_OFF_UPPER_STEAL_SUBHASH_BIT 56
+
+/* Additional features we understand.  Currently: none. */
+#define NTDB_FEATURE_MASK ((uint64_t)0)
+
+/* Convenience mask to get actual offset: clears the stolen upper bits
+ * and the low hash-group bits. */
+#define NTDB_OFF_MASK							\
+	(((1ULL << (64 - NTDB_OFF_UPPER_STEAL)) - 1) - NTDB_OFF_HASH_GROUP_MASK)
+
+/* How many buckets in a free list: see size_to_bucket(). */
+#define NTDB_FREE_BUCKETS (64 - NTDB_OFF_UPPER_STEAL)
+
+/* We have to be able to fit a free record here. */
+#define NTDB_MIN_DATA_LEN						\
+	(sizeof(struct ntdb_free_record) - sizeof(struct ntdb_used_record))
+
+/* Indicates this entry is not on an flist (can happen during coalescing) */
+#define NTDB_FTABLE_NONE ((1ULL << NTDB_OFF_UPPER_STEAL) - 1)
+
+struct ntdb_used_record {
+	/* For on-disk compatibility, we avoid bitfields:
+	   magic: 16,        (highest)  bits 48-63, see rec_magic()
+	   key_len_bits: 5,             bits 43-47, see rec_key_bits()
+	   extra_padding: 32            bits 11-42, see rec_extra_padding()
+	   hash_bits: 11                bits 0-10,  see rec_hash()
+	*/
+        uint64_t magic_and_meta;
+	/* The bottom key_len_bits*2 are key length, rest is data length. */
+        uint64_t key_and_data_len;
+};
+/* Width of the key-length field: twice the stored 5-bit value. */
+static inline unsigned rec_key_bits(const struct ntdb_used_record *r)
+{
+	return (unsigned)((r->magic_and_meta >> 43) & 0x1F) << 1;
+}
+/* Key length: the low rec_key_bits() bits of key_and_data_len. */
+static inline uint64_t rec_key_length(const struct ntdb_used_record *r)
+{
+	return r->key_and_data_len & ~(~0ULL << rec_key_bits(r));
+}
+/* Data length: everything above the key-length bits. */
+static inline uint64_t rec_data_length(const struct ntdb_used_record *r)
+{
+	return (r->key_and_data_len) >> rec_key_bits(r);
+}
+/* Extra padding: the 32 bits starting at bit 11. */
+static inline uint64_t rec_extra_padding(const struct ntdb_used_record *r)
+{
+	return (uint32_t)(r->magic_and_meta >> 11);
+}
+/* Stored hash bits: the bottom 11 bits. */
+static inline uint32_t rec_hash(const struct ntdb_used_record *r)
+{
+	return (uint32_t)(r->magic_and_meta & 0x7FF);
+}
+/* Record magic: the top 16 bits. */
+static inline uint16_t rec_magic(const struct ntdb_used_record *r)
+{
+	return (uint16_t)(r->magic_and_meta >> 48);
+}
+
+struct ntdb_free_record {
+        uint64_t magic_and_prev; /* NTDB_OFF_UPPER_STEAL bits magic, then prev */
+        uint64_t ftable_and_len; /* Top bits: ftable. Len excludes these two fields. */
+	/* This third word is why NTDB_MIN_DATA_LEN comes to 8 bytes.  */
+	uint64_t next;
+};
+/* Prev: magic_and_prev with the top NTDB_OFF_UPPER_STEAL bits cleared. */
+static inline uint64_t frec_prev(const struct ntdb_free_record *f)
+{
+	return f->magic_and_prev & (~0ULL >> NTDB_OFF_UPPER_STEAL);
+}
+/* Free-record magic: the top NTDB_OFF_UPPER_STEAL bits of magic_and_prev. */
+static inline uint64_t frec_magic(const struct ntdb_free_record *f)
+{
+	return (f->magic_and_prev) >> (64 - NTDB_OFF_UPPER_STEAL);
+}
+/* Length: ftable_and_len with the top NTDB_OFF_UPPER_STEAL bits cleared. */
+static inline uint64_t frec_len(const struct ntdb_free_record *f)
+{
+	return f->ftable_and_len & (~0ULL >> NTDB_OFF_UPPER_STEAL);
+}
+/* Free-table number: the top NTDB_OFF_UPPER_STEAL bits of ftable_and_len. */
+static inline unsigned frec_ftable(const struct ntdb_free_record *f)
+{
+	return (unsigned)(f->ftable_and_len >> (64 - NTDB_OFF_UPPER_STEAL));
+}
+
+struct ntdb_recovery_record {
+	uint64_t magic;
+	/* Length of record (add this header to get total length). */
+	uint64_t max_len;
+	/* Length used. */
+	uint64_t len;
+	/* Old length of file before transaction. */
+	uint64_t eof;
+};
+
+/* If we bottom out of the subhashes, we chain. */
+struct ntdb_chain {
+	ntdb_off_t rec[1 << NTDB_HASH_GROUP_BITS]; /* 8 slots: one hash group */
+	ntdb_off_t next; /* presumably the next chain block - TODO confirm */
+};
+
+/* this is stored at the front of every database */
+struct ntdb_header {
+	char magic_food[64]; /* NTDB_MAGIC_FOOD, zero padded; for /etc/magic */
+	/* FIXME: Make me 32 bit? */
+	uint64_t version; /* NTDB_VERSION; byte-swapped means NTDB_CONVERT */
+	uint64_t hash_test; /* result of hashing NTDB_HASH_MAGIC with hash_seed. */
+	uint64_t hash_seed; /* "random" seed written at creation time. */
+	ntdb_off_t free_table; /* (First) free table. */
+	ntdb_off_t recovery; /* Transaction recovery area (0 when created). */
+
+	uint64_t features_used; /* Features all writers understand */
+	uint64_t features_offered; /* Features offered */
+
+	uint64_t seqnum; /* Sequence number for NTDB_SEQNUM */
+
+	ntdb_off_t capabilities; /* Optional linked list of capabilities. */
+	ntdb_off_t reserved[22]; /* zeroed at creation */
+
+	/* Top level hash table: 1024 entries, all zero when created. */
+	ntdb_off_t hashtable[1ULL << NTDB_TOPLEVEL_HASH_BITS];
+};
+
+struct ntdb_freetable {
+	struct ntdb_used_record hdr; /* set up with NTDB_FTABLE_MAGIC */
+	ntdb_off_t next; /* presumably the next free table; 0 when created */
+	ntdb_off_t buckets[NTDB_FREE_BUCKETS]; /* see size_to_bucket() */
+};
+
+struct ntdb_capability {
+	struct ntdb_used_record hdr;
+	ntdb_off_t type; /* NTDB_CAP_* flag bits plus NTDB_CAP_TYPE_MASK number */
+	ntdb_off_t next; /* offset of the next capability; 0 ends the list */
+	/* ... */
+};
+
+/* Information about a particular (locked) hash entry. */
+struct hash_info {
+	/* Full hash value of entry. */
+	uint64_t h;
+	/* Start and length of lock acquired. */
+	ntdb_off_t hlock_start;
+	ntdb_len_t hlock_range;
+	/* Start of hash group. */
+	ntdb_off_t group_start;
+	/* Bucket we belong in. */
+	unsigned int home_bucket;
+	/* Bucket we (or an empty space) were found in. */
+	unsigned int found_bucket;
+	/* How many bits of the hash are already used. */
+	unsigned int hash_used;
+	/* Current working group. */
+	ntdb_off_t group[1 << NTDB_HASH_GROUP_BITS];
+};
+
+struct traverse_info {
+	struct traverse_level {
+		ntdb_off_t hashtable;
+		/* We ignore groups here, and treat it as a big array. */
+		unsigned entry;
+		unsigned int total_buckets;
+	} levels[NTDB_MAX_LEVELS + 1];
+	unsigned int num_levels;
+	unsigned int toplevel_group;
+	/* This makes delete-everything-inside-traverse work as expected. */
+	ntdb_off_t prev;
+};
+
+enum ntdb_lock_flags {
+	/* WAIT == F_SETLKW, NOWAIT == F_SETLK */
+	NTDB_LOCK_NOWAIT = 0,
+	NTDB_LOCK_WAIT = 1,
+	/* If set, don't log an error on failure. */
+	NTDB_LOCK_PROBE = 2,
+	/* If set, don't check for recovery (used by recovery code). */
+	NTDB_LOCK_NOCHECK = 4,
+};
+
+struct ntdb_lock {
+	struct ntdb_context *owner;
+	off_t off;
+	uint32_t count;
+	uint32_t ltype;
+};
+
+/* This is only needed for ntdb_access_commit, but used everywhere to
+ * simplify. */
+struct ntdb_access_hdr {
+	struct ntdb_access_hdr *next;
+	ntdb_off_t off;
+	ntdb_len_t len;
+	bool convert;
+};
+
+struct ntdb_file {
+	/* How many are sharing us? (find_file() bumps it on a double-open) */
+	unsigned int refcnt;
+
+	/* Mmap (if any), or malloc (for NTDB_INTERNAL). */
+	void *map_ptr;
+
+	/* How much space has been mapped (<= current file size) */
+	ntdb_len_t map_size;
+
+	/* The file descriptor (-1 for NTDB_INTERNAL). */
+	int fd;
+
+	/* Lock information */
+	pid_t locker;
+	struct ntdb_lock allrecord_lock;
+	size_t num_lockrecs;
+	struct ntdb_lock *lockrecs;
+
+	/* Identity of this file: st_dev and st_ino as reported by fstat(). */
+	dev_t device;
+	ino_t inode;
+};
+
+struct ntdb_methods {
+	enum NTDB_ERROR (*tread)(struct ntdb_context *, ntdb_off_t, void *,
+				 ntdb_len_t);
+	enum NTDB_ERROR (*twrite)(struct ntdb_context *, ntdb_off_t, const void *,
+				  ntdb_len_t);
+	enum NTDB_ERROR (*oob)(struct ntdb_context *, ntdb_off_t, ntdb_len_t, bool);
+	enum NTDB_ERROR (*expand_file)(struct ntdb_context *, ntdb_len_t);
+	void *(*direct)(struct ntdb_context *, ntdb_off_t, size_t, bool);
+};
+
+/*
+  internal prototypes
+*/
+/* hash.c: */
+uint64_t ntdb_jenkins_hash(const void *key, size_t length, uint64_t seed,
+			   void *unused);
+
+enum NTDB_ERROR first_in_hash(struct ntdb_context *ntdb,
+			      struct traverse_info *tinfo,
+			      NTDB_DATA *kbuf, size_t *dlen);
+
+enum NTDB_ERROR next_in_hash(struct ntdb_context *ntdb,
+			     struct traverse_info *tinfo,
+			     NTDB_DATA *kbuf, size_t *dlen);
+
+/* Hash random memory. */
+uint64_t ntdb_hash(struct ntdb_context *ntdb, const void *ptr, size_t len);
+
+/* Hash on disk. */
+uint64_t hash_record(struct ntdb_context *ntdb, ntdb_off_t off);
+
+/* Find and lock a hash entry (or where it would be). */
+ntdb_off_t find_and_lock(struct ntdb_context *ntdb,
+			 NTDB_DATA key,
+			 int ltype,
+			 struct hash_info *h,
+			 struct ntdb_used_record *rec,
+			 struct traverse_info *tinfo);
+
+enum NTDB_ERROR replace_in_hash(struct ntdb_context *ntdb,
+				struct hash_info *h,
+				ntdb_off_t new_off);
+
+enum NTDB_ERROR add_to_hash(struct ntdb_context *ntdb, struct hash_info *h,
+			    ntdb_off_t new_off);
+
+enum NTDB_ERROR delete_from_hash(struct ntdb_context *ntdb, struct hash_info *h);
+
+/* For ntdb_check */
+bool is_subhash(ntdb_off_t val);
+enum NTDB_ERROR unknown_capability(struct ntdb_context *ntdb, const char *caller,
+				   ntdb_off_t type);
+
+/* free.c: */
+enum NTDB_ERROR ntdb_ftable_init(struct ntdb_context *ntdb);
+
+/* check.c needs these to iterate through free lists. */
+ntdb_off_t first_ftable(struct ntdb_context *ntdb);
+ntdb_off_t next_ftable(struct ntdb_context *ntdb, ntdb_off_t ftable);
+
+/* This returns space or -ve error number. */
+ntdb_off_t alloc(struct ntdb_context *ntdb, size_t keylen, size_t datalen,
+		 uint64_t hash, unsigned magic, bool growing);
+
+/* Put this record in a free list. */
+enum NTDB_ERROR add_free_record(struct ntdb_context *ntdb,
+				ntdb_off_t off, ntdb_len_t len_with_header,
+				enum ntdb_lock_flags waitflag,
+				bool coalesce_ok);
+
+/* Set up header for a used/ftable/htable/chain/capability record. */
+enum NTDB_ERROR set_header(struct ntdb_context *ntdb,
+			   struct ntdb_used_record *rec,
+			   unsigned magic, uint64_t keylen, uint64_t datalen,
+			   uint64_t actuallen, unsigned hashlow);
+
+/* Used by ntdb_check to verify. */
+unsigned int size_to_bucket(ntdb_len_t data_len);
+ntdb_off_t bucket_off(ntdb_off_t ftable_off, unsigned bucket);
+
+/* Used by ntdb_summary */
+ntdb_off_t dead_space(struct ntdb_context *ntdb, ntdb_off_t off);
+
+/* Adjust expansion, used by create_recovery_area */
+ntdb_off_t ntdb_expand_adjust(ntdb_off_t map_size, ntdb_off_t size);
+
+/* io.c: */
+/* Initialize ntdb->methods. */
+void ntdb_io_init(struct ntdb_context *ntdb);
+
+/* Convert endian of the buffer if required. */
+void *ntdb_convert(const struct ntdb_context *ntdb, void *buf, ntdb_len_t size);
+
+/* Unmap and try to map the ntdb. */
+void ntdb_munmap(struct ntdb_file *file);
+enum NTDB_ERROR ntdb_mmap(struct ntdb_context *ntdb);
+
+/* Either alloc a copy, or give direct access.  Release frees or noop. */
+const void *ntdb_access_read(struct ntdb_context *ntdb,
+			     ntdb_off_t off, ntdb_len_t len, bool convert);
+void *ntdb_access_write(struct ntdb_context *ntdb,
+			ntdb_off_t off, ntdb_len_t len, bool convert);
+
+/* Release result of ntdb_access_read/write. */
+void ntdb_access_release(struct ntdb_context *ntdb, const void *p);
+/* Commit result of ntdb_access_write. */
+enum NTDB_ERROR ntdb_access_commit(struct ntdb_context *ntdb, void *p);
+
+/* Convenience routine to get an offset. */
+ntdb_off_t ntdb_read_off(struct ntdb_context *ntdb, ntdb_off_t off);
+
+/* Write an offset at an offset. */
+enum NTDB_ERROR ntdb_write_off(struct ntdb_context *ntdb, ntdb_off_t off,
+			       ntdb_off_t val);
+
+/* Clear an ondisk area. */
+enum NTDB_ERROR zero_out(struct ntdb_context *ntdb, ntdb_off_t off, ntdb_len_t len);
+
+/* Return a non-zero offset in the range start <= n < end of this array (or end). */
+ntdb_off_t ntdb_find_nonzero_off(struct ntdb_context *ntdb,
+				 ntdb_off_t base,
+				 uint64_t start,
+				 uint64_t end);
+
+/* Return a zero offset in this array, or num. */
+ntdb_off_t ntdb_find_zero_off(struct ntdb_context *ntdb, ntdb_off_t off,
+			      uint64_t num);
+
+/* Allocate and make a copy of some offset. */
+void *ntdb_alloc_read(struct ntdb_context *ntdb, ntdb_off_t offset, ntdb_len_t len);
+
+/* Writes a converted copy of a record. */
+enum NTDB_ERROR ntdb_write_convert(struct ntdb_context *ntdb, ntdb_off_t off,
+				   const void *rec, size_t len);
+
+/* Reads record and converts it */
+enum NTDB_ERROR ntdb_read_convert(struct ntdb_context *ntdb, ntdb_off_t off,
+				  void *rec, size_t len);
+
+/* Bump the seqnum (caller checks for ntdb->flags & NTDB_SEQNUM) */
+void ntdb_inc_seqnum(struct ntdb_context *ntdb);
+
+/* lock.c: */
+/* Print message because another ntdb owns a lock we want. */
+enum NTDB_ERROR owner_conflict(struct ntdb_context *ntdb, const char *call);
+
+/* If we fork, we no longer really own locks. */
+bool check_lock_pid(struct ntdb_context *ntdb, const char *call, bool log);
+
+/* Lock/unlock a range of hashes. */
+enum NTDB_ERROR ntdb_lock_hashes(struct ntdb_context *ntdb,
+				 ntdb_off_t hash_lock, ntdb_len_t hash_range,
+				 int ltype, enum ntdb_lock_flags waitflag);
+enum NTDB_ERROR ntdb_unlock_hashes(struct ntdb_context *ntdb,
+				   ntdb_off_t hash_lock,
+				   ntdb_len_t hash_range, int ltype);
+
+/* For closing the file. */
+void ntdb_lock_cleanup(struct ntdb_context *ntdb);
+
+/* Lock/unlock a particular free bucket. */
+enum NTDB_ERROR ntdb_lock_free_bucket(struct ntdb_context *ntdb, ntdb_off_t b_off,
+				      enum ntdb_lock_flags waitflag);
+void ntdb_unlock_free_bucket(struct ntdb_context *ntdb, ntdb_off_t b_off);
+
+/* Serialize transaction start. */
+enum NTDB_ERROR ntdb_transaction_lock(struct ntdb_context *ntdb, int ltype);
+void ntdb_transaction_unlock(struct ntdb_context *ntdb, int ltype);
+
+/* Do we have any hash locks (ie. via ntdb_chainlock) ? */
+bool ntdb_has_hash_locks(struct ntdb_context *ntdb);
+
+/* Lock entire database. */
+enum NTDB_ERROR ntdb_allrecord_lock(struct ntdb_context *ntdb, int ltype,
+				    enum ntdb_lock_flags flags, bool upgradable);
+void ntdb_allrecord_unlock(struct ntdb_context *ntdb, int ltype);
+enum NTDB_ERROR ntdb_allrecord_upgrade(struct ntdb_context *ntdb, off_t start);
+
+/* Serialize db open. */
+enum NTDB_ERROR ntdb_lock_open(struct ntdb_context *ntdb,
+			       int ltype, enum ntdb_lock_flags flags);
+void ntdb_unlock_open(struct ntdb_context *ntdb, int ltype);
+bool ntdb_has_open_lock(struct ntdb_context *ntdb);
+
+/* Serialize db expand. */
+enum NTDB_ERROR ntdb_lock_expand(struct ntdb_context *ntdb, int ltype);
+void ntdb_unlock_expand(struct ntdb_context *ntdb, int ltype);
+bool ntdb_has_expansion_lock(struct ntdb_context *ntdb);
+
+/* If it needs recovery, grab all the locks and do it. */
+enum NTDB_ERROR ntdb_lock_and_recover(struct ntdb_context *ntdb);
+
+/* Default lock and unlock functions. */
+int ntdb_fcntl_lock(int fd, int rw, off_t off, off_t len, bool waitflag, void *);
+int ntdb_fcntl_unlock(int fd, int rw, off_t off, off_t len, void *);
+
+/* transaction.c: */
+enum NTDB_ERROR ntdb_transaction_recover(struct ntdb_context *ntdb);
+ntdb_bool_err ntdb_needs_recovery(struct ntdb_context *ntdb);
+
+struct ntdb_context {	/* one open handle; its per-file state lives in *file */
+	/* Single list of all TDBs, to detect multiple opens. */
+	struct ntdb_context *next;
+
+	/* Filename of the database. */
+	const char *name;
+
+	/* Logging function */
+	void (*log_fn)(struct ntdb_context *ntdb,
+		       enum ntdb_log_level level,
+		       enum NTDB_ERROR ecode,
+		       const char *message,
+		       void *data);
+	void *log_data;
+
+	/* Open flags passed to ntdb_open. */
+	int open_flags;
+
+	/* low level (fcntl) lock functions. */
+	int (*lock_fn)(int fd, int rw, off_t off, off_t len, bool w, void *);
+	int (*unlock_fn)(int fd, int rw, off_t off, off_t len, void *);
+	void *lock_data;
+
+	/* the ntdb flags passed to ntdb_open. */
+	uint32_t flags;
+
+	/* Our statistics. */
+	struct ntdb_attribute_stats stats;
+
+	/* The actual file information */
+	struct ntdb_file *file;
+
+	/* Hash function. */
+	uint64_t (*hash_fn)(const void *key, size_t len, uint64_t seed, void *);
+	void *hash_data;
+	uint64_t hash_seed;
+
+	/* Our open hook, if any. */
+	enum NTDB_ERROR (*openhook)(int fd, void *data);
+	void *openhook_data;
+
+	/* Last error we returned. */
+	enum NTDB_ERROR last_error;
+
+	/* Are we accessing directly? (debugging check). */
+	int direct_access;
+
+	/* Set if we are in a transaction. */
+	struct ntdb_transaction *transaction;
+
+	/* What free table are we using? */
+	ntdb_off_t ftable_off;
+	unsigned int ftable;
+
+	/* IO methods: changes for transactions. */
+	const struct ntdb_methods *io;
+
+	/* Direct access information */
+	struct ntdb_access_hdr *access;
+};
+
+/* ntdb.c: */
+enum NTDB_ERROR COLD PRINTF_FMT(4, 5)
+	ntdb_logerr(struct ntdb_context *ntdb,
+		    enum NTDB_ERROR ecode,
+		    enum ntdb_log_level level,
+		    const char *fmt, ...);
+
+#ifdef NTDB_TRACE
+void ntdb_trace(struct ntdb_context *ntdb, const char *op);
+void ntdb_trace_seqnum(struct ntdb_context *ntdb, uint32_t seqnum, const char *op);
+void ntdb_trace_open(struct ntdb_context *ntdb, const char *op,
+		     unsigned hash_size, unsigned ntdb_flags, unsigned open_flags);
+void ntdb_trace_ret(struct ntdb_context *ntdb, const char *op, int ret);
+void ntdb_trace_retrec(struct ntdb_context *ntdb, const char *op, NTDB_DATA ret);
+void ntdb_trace_1rec(struct ntdb_context *ntdb, const char *op,
+		     NTDB_DATA rec);
+void ntdb_trace_1rec_ret(struct ntdb_context *ntdb, const char *op,
+			 NTDB_DATA rec, int ret);
+void ntdb_trace_1rec_retrec(struct ntdb_context *ntdb, const char *op,
+			    NTDB_DATA rec, NTDB_DATA ret);
+void ntdb_trace_2rec_flag_ret(struct ntdb_context *ntdb, const char *op,
+			      NTDB_DATA rec1, NTDB_DATA rec2, unsigned flag,
+			      int ret);
+void ntdb_trace_2rec_retrec(struct ntdb_context *ntdb, const char *op,
+			    NTDB_DATA rec1, NTDB_DATA rec2, NTDB_DATA ret);
+#else /* !NTDB_TRACE: every trace hook below expands to nothing */
+#define ntdb_trace(ntdb, op)
+#define ntdb_trace_seqnum(ntdb, seqnum, op)
+#define ntdb_trace_open(ntdb, op, hash_size, ntdb_flags, open_flags)
+#define ntdb_trace_ret(ntdb, op, ret)
+#define ntdb_trace_retrec(ntdb, op, ret)
+#define ntdb_trace_1rec(ntdb, op, rec)
+#define ntdb_trace_1rec_ret(ntdb, op, rec, ret)
+#define ntdb_trace_1rec_retrec(ntdb, op, rec, ret)
+#define ntdb_trace_2rec_flag_ret(ntdb, op, rec1, rec2, flag, ret)
+#define ntdb_trace_2rec_retrec(ntdb, op, rec1, rec2, ret)
+#endif /* !NTDB_TRACE */
+
+#endif
diff --git a/lib/ntdb/pyntdb.c b/lib/ntdb/pyntdb.c
new file mode 100644
index 0000000000..1f80e4227b
--- /dev/null
+++ b/lib/ntdb/pyntdb.c
@@ -0,0 +1,591 @@
+/*
+   Unix SMB/CIFS implementation.
+
+   Python interface to ntdb.  Simply modified from tdb version.
+
+   Copyright (C) 2004-2006 Tim Potter <tpot@samba.org>
+   Copyright (C) 2007-2008 Jelmer Vernooij <jelmer@samba.org>
+   Copyright (C) 2011 Rusty Russell <rusty@rustcorp.com.au>
+
+     ** NOTE! The following LGPL license applies to the ntdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <Python.h>
+#include "replace.h"
+#include "system/filesys.h"
+
+#ifndef Py_RETURN_NONE
+#define Py_RETURN_NONE return Py_INCREF(Py_None), Py_None
+#endif
+
+/* Include ntdb headers */
+#include <ntdb.h>
+
+typedef struct {
+	PyObject_HEAD
+	struct ntdb_context *ctx;	/* handle returned by ntdb_open() in py_ntdb_open() */
+	bool closed;	/* set by obj_close(); tells ntdb_object_dealloc() not to close again */
+} PyNtdbObject;
+
+staticforward PyTypeObject PyNtdb;
+
+static void PyErr_SetTDBError(enum NTDB_ERROR e)
+{
+	PyErr_SetObject(PyExc_RuntimeError,
+		Py_BuildValue("(i,s)", e, ntdb_errorstr(e)));
+}
+
+static NTDB_DATA PyString_AsNtdb_Data(PyObject *data)
+{	/* View a Python string as NTDB_DATA: pointer and size are taken from the object, nothing is allocated here. */
+	NTDB_DATA ret;
+	ret.dptr = (unsigned char *)PyString_AsString(data); /* NOTE(review): result is not checked here -- confirm every caller validates the type */
+	ret.dsize = PyString_Size(data);
+	return ret;
+}
+
+static PyObject *PyString_FromNtdb_Data(NTDB_DATA data)
+{	/* Build a Python string from data and consume data.dptr. */
+	PyObject *ret = PyString_FromStringAndSize((const char *)data.dptr,
+						   data.dsize);
+	free(data.dptr); /* freed whether or not the string could be created */
+	return ret;
+}
+
+#define PyErr_NTDB_ERROR_IS_ERR_RAISE(ret) \
+	if (ret != NTDB_SUCCESS) { \
+		PyErr_SetTDBError(ret); \
+		return NULL; \
+	}
+
+static void stderr_log(struct ntdb_context *ntdb,
+		       enum ntdb_log_level level,
+		       enum NTDB_ERROR ecode,
+		       const char *message,
+		       void *data)
+{	/* Log callback installed by py_ntdb_open(); level and data are not used. */
+	fprintf(stderr, "%s:%s:%s\n",
+		ntdb_name(ntdb), ntdb_errorstr(ecode), message);
+}
+
+static PyObject *py_ntdb_open(PyTypeObject *type, PyObject *args, PyObject *kwargs)
+{	/* Open a database and wrap the handle in a new PyNtdbObject; used as tp_new and as the module-level open(). */
+	char *name = NULL;
+	int ntdb_flags = NTDB_DEFAULT, flags = O_RDWR, mode = 0600;
+	struct ntdb_context *ctx;
+	PyNtdbObject *ret;
+	union ntdb_attribute logattr;
+	const char *kwnames[] = { "name", "ntdb_flags", "flags", "mode", NULL };
+
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|siii", cast_const2(char **, kwnames), &name, &ntdb_flags, &flags, &mode))
+		return NULL;
+
+	if (name == NULL) {
+		ntdb_flags |= NTDB_INTERNAL; /* no name given: force the NTDB_INTERNAL flag */
+	}
+
+	logattr.log.base.attr = NTDB_ATTRIBUTE_LOG;
+	logattr.log.base.next = NULL;
+	logattr.log.fn = stderr_log; /* ntdb log messages go to stderr */
+	ctx = ntdb_open(name, ntdb_flags, flags, mode, &logattr);
+	if (ctx == NULL) {
+		PyErr_SetFromErrno(PyExc_IOError); /* failure is reported from errno as IOError */
+		return NULL;
+	}
+
+	ret = PyObject_New(PyNtdbObject, &PyNtdb); /* always a PyNtdb: the type argument is not consulted */
+	if (!ret) {
+		ntdb_close(ctx); /* don't leak the handle if the wrapper can't be allocated */
+		return NULL;
+	}
+
+	ret->ctx = ctx;
+	ret->closed = false;
+	return (PyObject *)ret;
+}
+
+static PyObject *obj_transaction_cancel(PyNtdbObject *self)
+{	/* Python: S.transaction_cancel(); always returns None. */
+	ntdb_transaction_cancel(self->ctx); /* no status is inspected here */
+	Py_RETURN_NONE;
+}
+
+static PyObject *obj_transaction_commit(PyNtdbObject *self)
+{	/* Python: S.transaction_commit() -> None. */
+	enum NTDB_ERROR ret = ntdb_transaction_commit(self->ctx);
+	PyErr_NTDB_ERROR_IS_ERR_RAISE(ret); /* returns NULL with the error set unless ret is NTDB_SUCCESS */
+	Py_RETURN_NONE;
+}
+
+static PyObject *obj_transaction_prepare_commit(PyNtdbObject *self)
+{	/* Python: S.transaction_prepare_commit() -> None. */
+	enum NTDB_ERROR ret = ntdb_transaction_prepare_commit(self->ctx);
+	PyErr_NTDB_ERROR_IS_ERR_RAISE(ret); /* returns NULL with the error set unless ret is NTDB_SUCCESS */
+	Py_RETURN_NONE;
+}
+
+static PyObject *obj_transaction_start(PyNtdbObject *self)
+{	/* Python: S.transaction_start() -> None. */
+	enum NTDB_ERROR ret = ntdb_transaction_start(self->ctx);
+	PyErr_NTDB_ERROR_IS_ERR_RAISE(ret); /* returns NULL with the error set unless ret is NTDB_SUCCESS */
+	Py_RETURN_NONE;
+}
+
+static PyObject *obj_lockall(PyNtdbObject *self)
+{	/* Python: S.lock_all() -> None; wraps ntdb_lockall(). */
+	enum NTDB_ERROR ret = ntdb_lockall(self->ctx);
+	PyErr_NTDB_ERROR_IS_ERR_RAISE(ret); /* returns NULL with the error set unless ret is NTDB_SUCCESS */
+	Py_RETURN_NONE;
+}
+
+static PyObject *obj_unlockall(PyNtdbObject *self)
+{	/* Python: S.unlock_all(); always returns None. */
+	ntdb_unlockall(self->ctx); /* no status is inspected here */
+	Py_RETURN_NONE;
+}
+
+static PyObject *obj_lockall_read(PyNtdbObject *self)
+{	/* Python: S.read_lock_all() -> None; wraps ntdb_lockall_read(). */
+	enum NTDB_ERROR ret = ntdb_lockall_read(self->ctx);
+	PyErr_NTDB_ERROR_IS_ERR_RAISE(ret); /* returns NULL with the error set unless ret is NTDB_SUCCESS */
+	Py_RETURN_NONE;
+}
+
+static PyObject *obj_unlockall_read(PyNtdbObject *self)
+{	/* Python: S.read_unlock_all(); always returns None. */
+	ntdb_unlockall_read(self->ctx); /* no status is inspected here */
+	Py_RETURN_NONE;
+}
+
+static PyObject *obj_close(PyNtdbObject *self)
+{	/* Python: S.close(); closes once, later calls are no-ops returning None. */
+	int ret;
+	if (self->closed)
+		Py_RETURN_NONE;
+	ret = ntdb_close(self->ctx);
+	self->closed = true; /* NOTE(review): the other methods in this file use self->ctx without testing this flag */
+	if (ret != 0) {
+		PyErr_SetTDBError(NTDB_ERR_IO); /* any non-zero status is raised as NTDB_ERR_IO */
+		return NULL;
+	}
+	Py_RETURN_NONE;
+}
+
+/* Python: S.get(key) -> value, or None when the key does not exist. */
+static PyObject *obj_get(PyNtdbObject *self, PyObject *args)
+{
+	NTDB_DATA key, data;
+	PyObject *py_key;
+	enum NTDB_ERROR ret;
+	if (!PyArg_ParseTuple(args, "O", &py_key))
+		return NULL;
+
+	/* A NULL dptr means py_key was not a string; PyString_AsString() has
+	 * already set TypeError, so propagate it instead of handing ntdb a
+	 * NULL pointer with a bogus size. */
+	key = PyString_AsNtdb_Data(py_key);
+	if (key.dptr == NULL)
+		return NULL;
+
+	ret = ntdb_fetch(self->ctx, key, &data);
+	if (ret == NTDB_ERR_NOEXIST)
+		Py_RETURN_NONE;
+	PyErr_NTDB_ERROR_IS_ERR_RAISE(ret);
+	/* Copies the value and frees the buffer ntdb_fetch() returned. */
+	return PyString_FromNtdb_Data(data);
+}
+
+/* Python: S.append(key, value) -> None. */
+static PyObject *obj_append(PyNtdbObject *self, PyObject *args)
+{
+	NTDB_DATA key, data;
+	PyObject *py_key, *py_data;
+	enum NTDB_ERROR ret;
+	if (!PyArg_ParseTuple(args, "OO", &py_key, &py_data))
+		return NULL;
+
+	/* A NULL dptr means the argument was not a string; PyString_AsString()
+	 * has already set TypeError, so just propagate it. */
+	key = PyString_AsNtdb_Data(py_key);
+	if (key.dptr == NULL)
+		return NULL;
+	data = PyString_AsNtdb_Data(py_data);
+	if (data.dptr == NULL)
+		return NULL;
+
+	ret = ntdb_append(self->ctx, key, data);
+	PyErr_NTDB_ERROR_IS_ERR_RAISE(ret);
+	Py_RETURN_NONE;
+}
+
+static PyObject *obj_firstkey(PyNtdbObject *self)
+{	/* Python: S.firstkey() -> first key, or None when ntdb reports NTDB_ERR_NOEXIST. */
+	enum NTDB_ERROR ret;
+	NTDB_DATA key;
+
+	ret = ntdb_firstkey(self->ctx, &key);
+	if (ret == NTDB_ERR_NOEXIST)
+		Py_RETURN_NONE;
+	PyErr_NTDB_ERROR_IS_ERR_RAISE(ret); /* returns NULL with the error set unless ret is NTDB_SUCCESS */
+
+	return PyString_FromNtdb_Data(key); /* copies the key, then frees key.dptr */
+}
+
+/* Python: S.nextkey(key) -> the key following key, or None after the last. */
+static PyObject *obj_nextkey(PyNtdbObject *self, PyObject *args)
+{
+	NTDB_DATA key;
+	PyObject *py_key;
+	const char *src;
+	enum NTDB_ERROR ret;
+	if (!PyArg_ParseTuple(args, "O", &py_key))
+		return NULL;
+
+	/* NULL means py_key was not a string; TypeError is already set. */
+	src = PyString_AsString(py_key);
+	if (src == NULL)
+		return NULL;
+
+	/* Malloc here, since ntdb_nextkey frees.  Never request 0 bytes, so a
+	 * NULL result always means out of memory. */
+	key.dsize = PyString_Size(py_key);
+	key.dptr = malloc(key.dsize ? key.dsize : 1);
+	if (key.dptr == NULL)
+		return PyErr_NoMemory();
+	memcpy(key.dptr, src, key.dsize);
+
+	ret = ntdb_nextkey(self->ctx, &key);
+	if (ret == NTDB_ERR_NOEXIST)
+		Py_RETURN_NONE;
+	PyErr_NTDB_ERROR_IS_ERR_RAISE(ret);
+
+	return PyString_FromNtdb_Data(key);
+}
+
+/* Python: S.delete(key) -> None. */
+static PyObject *obj_delete(PyNtdbObject *self, PyObject *args)
+{
+	NTDB_DATA key;
+	PyObject *py_key;
+	enum NTDB_ERROR ret;
+	if (!PyArg_ParseTuple(args, "O", &py_key))
+		return NULL;
+
+	/* A NULL dptr means py_key was not a string; PyString_AsString() has
+	 * already set TypeError, so just propagate it. */
+	key = PyString_AsNtdb_Data(py_key);
+	if (key.dptr == NULL)
+		return NULL;
+
+	ret = ntdb_delete(self->ctx, key);
+	PyErr_NTDB_ERROR_IS_ERR_RAISE(ret);
+	Py_RETURN_NONE;
+}
+
+/* Python: S.has_key(key) -> True if key is present, False otherwise. */
+static PyObject *obj_has_key(PyNtdbObject *self, PyObject *args)
+{
+	NTDB_DATA key;
+	PyObject *py_key;
+	enum NTDB_ERROR err;
+	if (!PyArg_ParseTuple(args, "O", &py_key))
+		return NULL;
+
+	/* A NULL dptr means py_key was not a string; PyString_AsString() has
+	 * already set TypeError, so just propagate it. */
+	key = PyString_AsNtdb_Data(py_key);
+	if (key.dptr == NULL)
+		return NULL;
+
+	/* Py_True/Py_False are shared objects: a method must return a new
+	 * reference to them, which PyBool_FromLong() provides. */
+	if (ntdb_exists(self->ctx, key))
+		return PyBool_FromLong(1);
+
+	/* "Not found" is the normal negative answer; anything else that is
+	 * not success is a real failure and is raised. */
+	err = ntdb_error(self->ctx);
+	if (err != NTDB_ERR_NOEXIST && err != NTDB_SUCCESS) {
+		PyErr_SetTDBError(err);
+		return NULL;
+	}
+	return PyBool_FromLong(0);
+}
+
+/* Python: S.store(key, data, flag=REPLACE) -> None. */
+static PyObject *obj_store(PyNtdbObject *self, PyObject *args)
+{
+	NTDB_DATA key, value;
+	enum NTDB_ERROR ret;
+	int flag = NTDB_REPLACE;
+	PyObject *py_key, *py_value;
+
+	if (!PyArg_ParseTuple(args, "OO|i", &py_key, &py_value, &flag))
+		return NULL;
+
+	/* A NULL dptr means the argument was not a string; PyString_AsString()
+	 * has already set TypeError, so just propagate it. */
+	key = PyString_AsNtdb_Data(py_key);
+	if (key.dptr == NULL)
+		return NULL;
+	value = PyString_AsNtdb_Data(py_value);
+	if (value.dptr == NULL)
+		return NULL;
+
+	ret = ntdb_store(self->ctx, key, value, flag);
+	PyErr_NTDB_ERROR_IS_ERR_RAISE(ret);
+	Py_RETURN_NONE;
+}
+
+static PyObject *obj_add_flag(PyNtdbObject *self, PyObject *args)
+{	/* Python: S.add_flag(flag) -> None. */
+	unsigned flag;
+
+	if (!PyArg_ParseTuple(args, "I", &flag)) /* "I": a single unsigned int */
+		return NULL;
+
+	ntdb_add_flag(self->ctx, flag);
+	Py_RETURN_NONE;
+}
+
+static PyObject *obj_remove_flag(PyNtdbObject *self, PyObject *args)
+{	/* Python: S.remove_flag(flag) -> None. */
+	unsigned flag;
+
+	if (!PyArg_ParseTuple(args, "I", &flag)) /* "I": a single unsigned int */
+		return NULL;
+
+	ntdb_remove_flag(self->ctx, flag);
+	Py_RETURN_NONE;
+}
+
+typedef struct {
+	PyObject_HEAD
+	NTDB_DATA current;	/* key that the next ntdb_iter_next() call will return */
+	bool end;	/* true once there is no further key to return */
+	PyNtdbObject *iteratee;	/* database being iterated; a reference is held (see ntdb_object_iter) */
+} PyNtdbIteratorObject;
+
+/* tp_iternext: return the key held in self->current, then advance to the next
+ * one.  Returning NULL with no exception set signals the end of iteration. */
+static PyObject *ntdb_iter_next(PyNtdbIteratorObject *self)
+{
+	enum NTDB_ERROR e;
+	PyObject *ret;
+
+	if (self->end)
+		return NULL;
+	ret = PyString_FromStringAndSize((const char *)self->current.dptr,
+					 self->current.dsize);
+	e = ntdb_nextkey(self->iteratee->ctx, &self->current);
+	if (e == NTDB_SUCCESS)
+		return ret;
+
+	/* ntdb_nextkey() released the old key without supplying a new one, so
+	 * self->current must never be read (or freed) again. */
+	self->end = true;
+	if (e != NTDB_ERR_NOEXIST) {
+		/* Genuine failure: don't leak the string we just built. */
+		Py_XDECREF(ret);
+		PyErr_SetTDBError(e);
+		return NULL;
+	}
+	/* That was the last key; hand it out, the next call stops. */
+	return ret;
+}
+
+/* tp_dealloc: release the pending key (if any) and the iterated database. */
+static void ntdb_iter_dealloc(PyNtdbIteratorObject *self)
+{
+	/* A key is only pending while iteration has not finished; without
+	 * this an abandoned iterator would leak it. */
+	if (!self->end)
+		free(self->current.dptr);
+	Py_DECREF(self->iteratee);
+	PyObject_Del(self);
+}
+
+PyTypeObject PyNtdbIterator = {	/* type of the objects created by ntdb_object_iter() */
+	.tp_name = "Iterator",
+	.tp_basicsize = sizeof(PyNtdbIteratorObject),
+	.tp_iternext = (iternextfunc)ntdb_iter_next,
+	.tp_dealloc = (destructor)ntdb_iter_dealloc,
+	.tp_flags = Py_TPFLAGS_DEFAULT,
+	.tp_iter = PyObject_SelfIter, /* iter(it) returns the iterator itself */
+};
+
+/* tp_iter / S.iterkeys(): create an iterator positioned on the first key. */
+static PyObject *ntdb_object_iter(PyNtdbObject *self)
+{
+	PyNtdbIteratorObject *ret;
+	enum NTDB_ERROR e;
+
+	ret = PyObject_New(PyNtdbIteratorObject, &PyNtdbIterator);
+	if (!ret)
+		return NULL;
+
+	/* Give the iterator its reference up front: ntdb_iter_dealloc() always
+	 * releases iteratee, so the error path below can simply drop ret
+	 * instead of leaking it. */
+	Py_INCREF(self);
+	ret->iteratee = self;
+
+	e = ntdb_firstkey(self->ctx, &ret->current);
+	/* Anything but success leaves no key to hand out. */
+	ret->end = (e != NTDB_SUCCESS);
+	if (e != NTDB_SUCCESS && e != NTDB_ERR_NOEXIST) {
+		Py_DECREF(ret);
+		PyErr_SetTDBError(e);
+		return NULL;
+	}
+	return (PyObject *)ret;
+}
+
+static PyObject *obj_clear(PyNtdbObject *self)
+{	/* Python: S.clear() -> None; wraps ntdb_wipe_all(). */
+	enum NTDB_ERROR ret = ntdb_wipe_all(self->ctx);
+	PyErr_NTDB_ERROR_IS_ERR_RAISE(ret); /* returns NULL with the error set unless ret is NTDB_SUCCESS */
+	Py_RETURN_NONE;
+}
+
+static PyObject *obj_enable_seqnum(PyNtdbObject *self)
+{	/* Python: S.enable_seqnum() -> None. */
+	ntdb_add_flag(self->ctx, NTDB_SEQNUM); /* same call obj_add_flag() makes, with a fixed flag */
+	Py_RETURN_NONE;
+}
+
+/* Methods of ntdb.Ntdb.  The calling-convention flag of each entry must match
+ * how the C function reads its arguments: METH_VARARGS for everything that
+ * calls PyArg_ParseTuple(), METH_NOARGS otherwise. */
+static PyMethodDef ntdb_object_methods[] = {
+	{ "transaction_cancel", (PyCFunction)obj_transaction_cancel, METH_NOARGS,
+		"S.transaction_cancel() -> None\n"
+		"Cancel the currently active transaction." },
+	{ "transaction_commit", (PyCFunction)obj_transaction_commit, METH_NOARGS,
+		"S.transaction_commit() -> None\n"
+		"Commit the currently active transaction." },
+	{ "transaction_prepare_commit", (PyCFunction)obj_transaction_prepare_commit, METH_NOARGS,
+		"S.transaction_prepare_commit() -> None\n"
+		"Prepare to commit the currently active transaction" },
+	{ "transaction_start", (PyCFunction)obj_transaction_start, METH_NOARGS,
+		"S.transaction_start() -> None\n"
+		"Start a new transaction." },
+	{ "lock_all", (PyCFunction)obj_lockall, METH_NOARGS, NULL },
+	{ "unlock_all", (PyCFunction)obj_unlockall, METH_NOARGS, NULL },
+	{ "read_lock_all", (PyCFunction)obj_lockall_read, METH_NOARGS, NULL },
+	{ "read_unlock_all", (PyCFunction)obj_unlockall_read, METH_NOARGS, NULL },
+	{ "close", (PyCFunction)obj_close, METH_NOARGS, NULL },
+	{ "get", (PyCFunction)obj_get, METH_VARARGS, "S.get(key) -> value\n"
+		"Fetch a value." },
+	{ "append", (PyCFunction)obj_append, METH_VARARGS, "S.append(key, value) -> None\n"
+		"Append data to an existing key." },
+	{ "firstkey", (PyCFunction)obj_firstkey, METH_NOARGS, "S.firstkey() -> data\n"
+		"Return the first key in this database." },
+	/* obj_nextkey() parses a key argument, so it cannot be METH_NOARGS. */
+	{ "nextkey", (PyCFunction)obj_nextkey, METH_VARARGS, "S.nextkey(key) -> data\n"
+		"Return the next key in this database." },
+	{ "delete", (PyCFunction)obj_delete, METH_VARARGS, "S.delete(key) -> None\n"
+		"Delete an entry." },
+	{ "has_key", (PyCFunction)obj_has_key, METH_VARARGS, "S.has_key(key) -> bool\n"
+		"Check whether key exists in this database." },
+	{ "store", (PyCFunction)obj_store, METH_VARARGS, "S.store(key, data, flag=REPLACE) -> None\n"
+		"Store data." },
+	{ "add_flag", (PyCFunction)obj_add_flag, METH_VARARGS, "S.add_flag(flag) -> None" },
+	{ "remove_flag", (PyCFunction)obj_remove_flag, METH_VARARGS, "S.remove_flag(flag) -> None" },
+	{ "iterkeys", (PyCFunction)ntdb_object_iter, METH_NOARGS, "S.iterkeys() -> iterator" },
+	{ "clear", (PyCFunction)obj_clear, METH_NOARGS, "S.clear() -> None\n"
+		"Wipe the entire database." },
+	{ "enable_seqnum", (PyCFunction)obj_enable_seqnum, METH_NOARGS,
+		"S.enable_seqnum() -> None" },
+	{ NULL }
+};
+
+static PyObject *obj_get_flags(PyNtdbObject *self, void *closure)
+{	/* Getter for the "flags" attribute; closure is unused. */
+	return PyInt_FromLong(ntdb_get_flags(self->ctx));
+}
+
+static PyObject *obj_get_filename(PyNtdbObject *self, void *closure)
+{	/* Getter for the "filename" attribute; closure is unused. */
+	return PyString_FromString(ntdb_name(self->ctx)); /* NOTE(review): assumes ntdb_name() is never NULL, even when py_ntdb_open() passed no name -- confirm */
+}
+
+static PyObject *obj_get_seqnum(PyNtdbObject *self, void *closure)
+{	/* Getter for the "seqnum" attribute; closure is unused. */
+	return PyInt_FromLong(ntdb_get_seqnum(self->ctx));
+}
+
+
+static PyGetSetDef ntdb_object_getsetters[] = {	/* attributes of ntdb.Ntdb; every setter slot is NULL */
+	{ cast_const(char *, "flags"), (getter)obj_get_flags, NULL, NULL },
+	{ cast_const(char *, "filename"), (getter)obj_get_filename, NULL,
+	  cast_const(char *, "The filename of this NTDB file.")},
+	{ cast_const(char *, "seqnum"), (getter)obj_get_seqnum, NULL, NULL },
+	{ NULL }
+};
+
+static PyObject *ntdb_object_repr(PyNtdbObject *self)
+{	/* tp_repr: "Ntdb(<internal>)" when NTDB_INTERNAL is set, else "Ntdb('<name>')". */
+	if (ntdb_get_flags(self->ctx) & NTDB_INTERNAL) {
+		return PyString_FromString("Ntdb(<internal>)");
+	} else {
+		return PyString_FromFormat("Ntdb('%s')", ntdb_name(self->ctx));
+	}
+}
+
+static void ntdb_object_dealloc(PyNtdbObject *self)
+{	/* tp_dealloc: close the database unless obj_close() already did, then free the object. */
+	if (!self->closed)
+		ntdb_close(self->ctx); /* the return value is ignored here */
+	self->ob_type->tp_free(self);
+}
+
+static PyObject *obj_getitem(PyNtdbObject *self, PyObject *key)
+{	/* mp_subscript (db[key]): KeyError when the key is absent, TypeError for non-string keys. */
+	NTDB_DATA tkey, val;
+	enum NTDB_ERROR ret;
+
+	if (!PyString_Check(key)) {
+		PyErr_SetString(PyExc_TypeError, "Expected string as key");
+		return NULL;
+	}
+
+	tkey.dptr = (unsigned char *)PyString_AsString(key);
+	tkey.dsize = PyString_Size(key);
+
+	ret = ntdb_fetch(self->ctx, tkey, &val);
+	if (ret == NTDB_ERR_NOEXIST) {
+		PyErr_SetString(PyExc_KeyError, "No such NTDB entry");
+		return NULL;
+	} else {
+		PyErr_NTDB_ERROR_IS_ERR_RAISE(ret); /* returns NULL with the error set unless ret is NTDB_SUCCESS */
+		return PyString_FromNtdb_Data(val); /* copies the value, then frees val.dptr */
+	}
+}
+
+static int obj_setitem(PyNtdbObject *self, PyObject *key, PyObject *value)
+{	/* mp_ass_subscript: store value under key, or delete key when value is NULL; -1 on error. */
+	NTDB_DATA tkey, tval;
+	enum NTDB_ERROR ret;
+	if (!PyString_Check(key)) {
+		PyErr_SetString(PyExc_TypeError, "Expected string as key");
+		return -1;
+	}
+
+	tkey = PyString_AsNtdb_Data(key);
+
+	if (value == NULL) {
+		ret = ntdb_delete(self->ctx, tkey); /* no value supplied: this is a deletion */
+	} else {
+		if (!PyString_Check(value)) {
+			PyErr_SetString(PyExc_TypeError, "Expected string as value");
+			return -1;
+		}
+
+		tval = PyString_AsNtdb_Data(value);
+
+		ret = ntdb_store(self->ctx, tkey, tval, NTDB_REPLACE);
+	}
+
+	if (ret != NTDB_SUCCESS) {
+		PyErr_SetTDBError(ret);
+		return -1;
+	}
+
+	return ret; /* ret is NTDB_SUCCESS here; presumably 0, the success value this slot expects -- confirm */
+}
+
+static PyMappingMethods ntdb_object_mapping = {	/* db[key] access; no mp_length slot is provided */
+	.mp_subscript = (binaryfunc)obj_getitem,
+	.mp_ass_subscript = (objobjargproc)obj_setitem, /* handles both assignment and deletion */
+};
+static PyTypeObject PyNtdb = {	/* the ntdb.Ntdb type, registered by the module init function */
+	.tp_name = "ntdb.Ntdb",
+	.tp_basicsize = sizeof(PyNtdbObject),
+	.tp_methods = ntdb_object_methods,
+	.tp_getset = ntdb_object_getsetters,
+	.tp_new = py_ntdb_open, /* constructing the type opens a database */
+	.tp_doc = "A NTDB file",
+	.tp_repr = (reprfunc)ntdb_object_repr,
+	.tp_dealloc = (destructor)ntdb_object_dealloc,
+	.tp_as_mapping = &ntdb_object_mapping,
+	.tp_flags = Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE|Py_TPFLAGS_HAVE_ITER,
+	.tp_iter = (getiterfunc)ntdb_object_iter, /* iterating the object yields its keys */
+};
+
+/* Module-level functions.  The docstring lists exactly the keywords that
+ * py_ntdb_open() parses: name, ntdb_flags, flags and mode. */
+static PyMethodDef ntdb_methods[] = {
+	{ "open", (PyCFunction)py_ntdb_open, METH_VARARGS|METH_KEYWORDS, "open(name, ntdb_flags=NTDB_DEFAULT, flags=O_RDWR, mode=0600)\n"
+		"Open a NTDB file." },
+	{ NULL }
+};
+
+/* Module entry point.  Python 2 looks up "init" + the module name, and this
+ * module registers itself as "ntdb", so the function must be initntdb; the
+ * old name inittdb was also the init symbol of the separate tdb module. */
+void initntdb(void);
+void initntdb(void)
+{
+	PyObject *m;
+
+	if (PyType_Ready(&PyNtdb) < 0)
+		return;
+
+	if (PyType_Ready(&PyNtdbIterator) < 0)
+		return;
+
+	m = Py_InitModule3("ntdb", ntdb_methods, "NTDB is a simple key-value database similar to GDBM that supports multiple writers.");
+	if (m == NULL)
+		return;
+
+	/* Flags accepted by Ntdb.store(). */
+	PyModule_AddObject(m, "REPLACE", PyInt_FromLong(NTDB_REPLACE));
+	PyModule_AddObject(m, "INSERT", PyInt_FromLong(NTDB_INSERT));
+	PyModule_AddObject(m, "MODIFY", PyInt_FromLong(NTDB_MODIFY));
+
+	/* Flags accepted as ntdb_flags by open() and by add_flag()/remove_flag(). */
+	PyModule_AddObject(m, "DEFAULT", PyInt_FromLong(NTDB_DEFAULT));
+	PyModule_AddObject(m, "INTERNAL", PyInt_FromLong(NTDB_INTERNAL));
+	PyModule_AddObject(m, "NOLOCK", PyInt_FromLong(NTDB_NOLOCK));
+	PyModule_AddObject(m, "NOMMAP", PyInt_FromLong(NTDB_NOMMAP));
+	PyModule_AddObject(m, "CONVERT", PyInt_FromLong(NTDB_CONVERT));
+	PyModule_AddObject(m, "NOSYNC", PyInt_FromLong(NTDB_NOSYNC));
+	PyModule_AddObject(m, "SEQNUM", PyInt_FromLong(NTDB_SEQNUM));
+	PyModule_AddObject(m, "ALLOW_NESTING", PyInt_FromLong(NTDB_ALLOW_NESTING));
+
+	PyModule_AddObject(m, "__docformat__", PyString_FromString("restructuredText"));
+
+	PyModule_AddObject(m, "__version__", PyString_FromString(PACKAGE_VERSION));
+
+	/* PyModule_AddObject steals a reference, so take one for the module. */
+	Py_INCREF(&PyNtdb);
+	PyModule_AddObject(m, "Ntdb", (PyObject *)&PyNtdb);
+
+	Py_INCREF(&PyNtdbIterator);
+}
diff --git a/lib/ntdb/summary.c b/lib/ntdb/summary.c
new file mode 100644
index 0000000000..28ffd61df9
--- /dev/null
+++ b/lib/ntdb/summary.c
@@ -0,0 +1,330 @@
+ /*
+   Trivial Database 2: human-readable summary code
+   Copyright (C) Rusty Russell 2010
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <assert.h>
+#include <ccan/tally/tally.h>
+
+#define SUMMARY_FORMAT \
+	"Size of file/data: %zu/%zu\n" \
+	"Number of records: %zu\n" \
+	"Smallest/average/largest keys: %zu/%zu/%zu\n%s" \
+	"Smallest/average/largest data: %zu/%zu/%zu\n%s" \
+	"Smallest/average/largest padding: %zu/%zu/%zu\n%s" \
+	"Number of free records: %zu\n" \
+	"Smallest/average/largest free records: %zu/%zu/%zu\n%s" \
+	"Number of uncoalesced records: %zu\n" \
+	"Smallest/average/largest uncoalesced runs: %zu/%zu/%zu\n%s" \
+	"Toplevel hash used: %u of %u\n" \
+	"Number of chains: %zu\n" \
+	"Number of subhashes: %zu\n" \
+	"Smallest/average/largest subhash entries: %zu/%zu/%zu\n%s" \
+	"Percentage keys/data/padding/free/rechdrs/freehdrs/hashes: %.0f/%.0f/%.0f/%.0f/%.0f/%.0f/%.0f\n"
+/* Line formats for a single free bucket and for a single capability. */
+#define BUCKET_SUMMARY_FORMAT_A					\
+	"Free bucket %zu: total entries %zu.\n"			\
+	"Smallest/average/largest length: %zu/%zu/%zu\n%s"
+#define BUCKET_SUMMARY_FORMAT_B					\
+	"Free bucket %zu-%zu: total entries %zu.\n"		\
+	"Smallest/average/largest length: %zu/%zu/%zu\n%s"
+#define CAPABILITY_FORMAT					\
+	"Capability %llu%s\n"
+/* Histogram dimensions handed to tally_new() and tally_histogram(). */
+#define HISTO_WIDTH 70
+#define HISTO_HEIGHT 20
+
+/* Count the non-zero entries of the (1 << bits)-slot table at hash_off. */
+static ntdb_off_t count_hash(struct ntdb_context *ntdb,
+			    ntdb_off_t hash_off, unsigned bits)
+{
+	const ntdb_off_t *table;
+	ntdb_off_t used = 0;
+	unsigned int slot;
+
+	table = ntdb_access_read(ntdb, hash_off, sizeof(*table) << bits, true);
+	if (NTDB_PTR_IS_ERR(table))
+		return NTDB_ERR_TO_OFF(NTDB_PTR_ERR(table));
+	for (slot = 0; slot < (1 << bits); slot++)
+		if (table[slot])
+			used++;
+	ntdb_access_release(ntdb, table);
+	return used;
+}
+
+/* Walk every record after the header and add its sizes to the tallies.
+ * Returns NTDB_SUCCESS, or the error code of the access that failed. */
+static enum NTDB_ERROR summarize(struct ntdb_context *ntdb,
+				struct tally *hashes,
+				struct tally *ftables,
+				struct tally *fr,
+				struct tally *keys,
+				struct tally *data,
+				struct tally *extra,
+				struct tally *uncoal,
+				struct tally *chains,
+				size_t *num_caps)
+{
+	ntdb_off_t off;
+	ntdb_len_t len;
+	ntdb_len_t unc = 0;
+
+	for (off = sizeof(struct ntdb_header);
+	     off < ntdb->file->map_size;
+	     off += len) {
+		const union {
+			struct ntdb_used_record u;
+			struct ntdb_free_record f;
+			struct ntdb_recovery_record r;
+		} *p;
+		/* We might not be able to get the whole thing. */
+		p = ntdb_access_read(ntdb, off, sizeof(p->f), true);
+		if (NTDB_PTR_IS_ERR(p)) {
+			return NTDB_PTR_ERR(p);
+		}
+		if (frec_magic(&p->f) != NTDB_FREE_MAGIC && unc > 1) {
+			tally_add(uncoal, unc);
+			unc = 0;
+		}
+
+		if (p->r.magic == NTDB_RECOVERY_INVALID_MAGIC
+		    || p->r.magic == NTDB_RECOVERY_MAGIC) {
+			len = sizeof(p->r) + p->r.max_len;
+		} else if (frec_magic(&p->f) == NTDB_FREE_MAGIC) {
+			len = frec_len(&p->f);
+			tally_add(fr, len);
+			len += sizeof(p->u);
+			unc++;
+		} else if (rec_magic(&p->u) == NTDB_USED_MAGIC) {
+			len = sizeof(p->u)
+				+ rec_key_length(&p->u)
+				+ rec_data_length(&p->u)
+				+ rec_extra_padding(&p->u);
+			tally_add(keys, rec_key_length(&p->u));
+			tally_add(data, rec_data_length(&p->u));
+			tally_add(extra, rec_extra_padding(&p->u));
+		} else if (rec_magic(&p->u) == NTDB_HTABLE_MAGIC) {
+			ntdb_off_t count = count_hash(ntdb, off + sizeof(p->u),
+						     NTDB_SUBLEVEL_HASH_BITS);
+			if (NTDB_OFF_IS_ERR(count)) {
+				ntdb_access_release(ntdb, p); /* no leak on error */
+				return NTDB_OFF_TO_ERR(count);
+			}
+			tally_add(hashes, count);
+			tally_add(extra, rec_extra_padding(&p->u));
+			len = sizeof(p->u)
+				+ rec_data_length(&p->u)
+				+ rec_extra_padding(&p->u);
+		} else if (rec_magic(&p->u) == NTDB_FTABLE_MAGIC) {
+			len = sizeof(p->u)
+				+ rec_data_length(&p->u)
+				+ rec_extra_padding(&p->u);
+			tally_add(ftables, rec_data_length(&p->u));
+			tally_add(extra, rec_extra_padding(&p->u));
+		} else if (rec_magic(&p->u) == NTDB_CHAIN_MAGIC) {
+			len = sizeof(p->u)
+				+ rec_data_length(&p->u)
+				+ rec_extra_padding(&p->u);
+			tally_add(chains, 1);
+			tally_add(extra, rec_extra_padding(&p->u));
+		} else if (rec_magic(&p->u) == NTDB_CAP_MAGIC) {
+			len = sizeof(p->u)
+				+ rec_data_length(&p->u)
+				+ rec_extra_padding(&p->u);
+			(*num_caps)++;
+		} else {
+			len = dead_space(ntdb, off);
+			if (NTDB_OFF_IS_ERR(len)) {
+				ntdb_access_release(ntdb, p); /* no leak on error */
+				return NTDB_OFF_TO_ERR(len);
+			}
+		}
+		ntdb_access_release(ntdb, p);
+	}
+	if (unc)
+		tally_add(uncoal, unc);
+	return NTDB_SUCCESS;
+}
+
+/* Append one CAPABILITY_FORMAT line per entry of the header's capability
+ * list to the string in summary; stops quietly on the first read error. */
+static void add_capabilities(struct ntdb_context *ntdb, char *summary)
+{
+	ntdb_off_t off, next;
+	const struct ntdb_capability *cap;
+
+	/* Append to summary. */
+	summary += strlen(summary);
+
+	off = ntdb_read_off(ntdb, offsetof(struct ntdb_header, capabilities));
+	if (NTDB_OFF_IS_ERR(off))
+		return;
+
+	/* Walk capability list. */
+	for (; off; off = next) {
+		cap = ntdb_access_read(ntdb, off, sizeof(*cap), true);
+		if (NTDB_PTR_IS_ERR(cap))
+			break;
+		/* %llu needs unsigned long long whatever type cap->type has. */
+		sprintf(summary, CAPABILITY_FORMAT,
+			(unsigned long long)(cap->type & NTDB_CAP_TYPE_MASK),
+			/* Noopen?  How did we get here? */
+			(cap->type & NTDB_CAP_NOOPEN) ? " (unopenable)"
+			: ((cap->type & NTDB_CAP_NOWRITE)
+			   && (cap->type & NTDB_CAP_NOCHECK)) ? " (uncheckable,read-only)"
+			: (cap->type & NTDB_CAP_NOWRITE) ? " (read-only)"
+			: (cap->type & NTDB_CAP_NOCHECK) ? " (uncheckable)"
+			: "");
+		summary += strlen(summary);
+		next = cap->next;
+		ntdb_access_release(ntdb, cap);
+	}
+}
+
+/* Describe the database in a malloc()ed string which the caller must free().
+ * On failure *summary is NULL and the error is also left in last_error. */
+_PUBLIC_ enum NTDB_ERROR ntdb_summary(struct ntdb_context *ntdb,
+			   enum ntdb_summary_flags flags,
+			   char **summary)
+{
+	ntdb_len_t len;
+	ntdb_off_t toplevel;
+	size_t num_caps = 0;
+	struct tally *ftables, *hashes, *freet, *keys, *data, *extra, *uncoal,
+		*chains;
+	char *hashesg, *freeg, *keysg, *datag, *extrag, *uncoalg;
+	enum NTDB_ERROR ecode;
+
+	hashesg = freeg = keysg = datag = extrag = uncoalg = NULL;
+	*summary = NULL;	/* defined result on every failure path */
+	ecode = ntdb_allrecord_lock(ntdb, F_RDLCK, NTDB_LOCK_WAIT, false);
+	if (ecode != NTDB_SUCCESS)
+		return ntdb->last_error = ecode;
+	ecode = ntdb_lock_expand(ntdb, F_RDLCK);
+	if (ecode != NTDB_SUCCESS) {
+		ntdb_allrecord_unlock(ntdb, F_RDLCK);
+		return ntdb->last_error = ecode;
+	}
+	/* Start stats off empty. */
+	ftables = tally_new(HISTO_HEIGHT);
+	hashes = tally_new(HISTO_HEIGHT);
+	freet = tally_new(HISTO_HEIGHT);
+	keys = tally_new(HISTO_HEIGHT);
+	data = tally_new(HISTO_HEIGHT);
+	extra = tally_new(HISTO_HEIGHT);
+	uncoal = tally_new(HISTO_HEIGHT);
+	chains = tally_new(HISTO_HEIGHT);
+	if (!ftables || !hashes || !freet || !keys || !data || !extra
+	    || !uncoal || !chains) {
+		ecode = ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
+				   "ntdb_summary: failed to allocate"
+				   " tally structures");
+		goto unlock;
+	}
+
+	ecode = summarize(ntdb, hashes, ftables, freet, keys, data, extra,
+			  uncoal, chains, &num_caps);
+	if (ecode != NTDB_SUCCESS)
+		goto unlock;
+	/* Count the top level up front so a read error is not printed as data. */
+	toplevel = count_hash(ntdb, offsetof(struct ntdb_header, hashtable),
+			      NTDB_TOPLEVEL_HASH_BITS);
+	if (NTDB_OFF_IS_ERR(toplevel)) {
+		ecode = NTDB_OFF_TO_ERR(toplevel);
+		goto unlock;
+	}
+
+	if (flags & NTDB_SUMMARY_HISTOGRAMS) {
+		hashesg = tally_histogram(hashes, HISTO_WIDTH, HISTO_HEIGHT);
+		freeg = tally_histogram(freet, HISTO_WIDTH, HISTO_HEIGHT);
+		keysg = tally_histogram(keys, HISTO_WIDTH, HISTO_HEIGHT);
+		datag = tally_histogram(data, HISTO_WIDTH, HISTO_HEIGHT);
+		extrag = tally_histogram(extra, HISTO_WIDTH, HISTO_HEIGHT);
+		uncoalg = tally_histogram(uncoal, HISTO_WIDTH, HISTO_HEIGHT);
+	}
+
+	/* 20 is max length of a %llu. */
+	len = strlen(SUMMARY_FORMAT) + 33*20 + 1
+		+ (hashesg ? strlen(hashesg) : 0)
+		+ (freeg ? strlen(freeg) : 0)
+		+ (keysg ? strlen(keysg) : 0)
+		+ (datag ? strlen(datag) : 0)
+		+ (extrag ? strlen(extrag) : 0)
+		+ (uncoalg ? strlen(uncoalg) : 0)
+		+ num_caps * (strlen(CAPABILITY_FORMAT) + 20
+			      + strlen(" (uncheckable,read-only)"));
+	*summary = malloc(len);
+	if (!*summary) {
+		ecode = ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
+				   "ntdb_summary: failed to allocate string");
+		goto unlock;
+	}
+	sprintf(*summary, SUMMARY_FORMAT,
+		(size_t)ntdb->file->map_size,
+		tally_total(keys, NULL) + tally_total(data, NULL),
+		tally_num(keys),
+		tally_min(keys), tally_mean(keys), tally_max(keys),
+		keysg ? keysg : "",
+		tally_min(data), tally_mean(data), tally_max(data),
+		datag ? datag : "",
+		tally_min(extra), tally_mean(extra), tally_max(extra),
+		extrag ? extrag : "",
+		tally_num(freet),
+		tally_min(freet), tally_mean(freet), tally_max(freet),
+		freeg ? freeg : "",
+		tally_total(uncoal, NULL),
+		tally_min(uncoal), tally_mean(uncoal), tally_max(uncoal),
+		uncoalg ? uncoalg : "",
+		(unsigned)toplevel,
+		1 << NTDB_TOPLEVEL_HASH_BITS,
+		tally_num(chains),
+		tally_num(hashes),
+		tally_min(hashes), tally_mean(hashes), tally_max(hashes),
+		hashesg ? hashesg : "",
+		tally_total(keys, NULL) * 100.0 / ntdb->file->map_size,
+		tally_total(data, NULL) * 100.0 / ntdb->file->map_size,
+		tally_total(extra, NULL) * 100.0 / ntdb->file->map_size,
+		tally_total(freet, NULL) * 100.0 / ntdb->file->map_size,
+		(tally_num(keys) + tally_num(freet) + tally_num(hashes))
+		* sizeof(struct ntdb_used_record) * 100.0 / ntdb->file->map_size,
+		tally_num(ftables) * sizeof(struct ntdb_freetable)
+		* 100.0 / ntdb->file->map_size,
+		(tally_num(hashes)
+		 * (sizeof(ntdb_off_t) << NTDB_SUBLEVEL_HASH_BITS)
+		 + (sizeof(ntdb_off_t) << NTDB_TOPLEVEL_HASH_BITS)
+		 + sizeof(struct ntdb_chain) * tally_num(chains))
+		* 100.0 / ntdb->file->map_size);
+	add_capabilities(ntdb, *summary);
+
+unlock:
+	free(hashesg);
+	free(freeg);
+	free(keysg);
+	free(datag);
+	free(extrag);
+	free(uncoalg);
+	free(hashes);
+	free(freet);
+	free(keys);
+	free(data);
+	free(extra);
+	free(uncoal);
+	free(ftables);
+	free(chains);
+	ntdb_allrecord_unlock(ntdb, F_RDLCK);
+	ntdb_unlock_expand(ntdb, F_RDLCK);
+	return ntdb->last_error = ecode;
+}
diff --git a/lib/ntdb/test/api-12-store.c b/lib/ntdb/test/api-12-store.c
new file mode 100644
index 0000000000..24d9498755
--- /dev/null
+++ b/lib/ntdb/test/api-12-store.c
@@ -0,0 +1,57 @@
+#include "config.h"
+#include "ntdb.h"
+#include "tap-interface.h"
+#include <ccan/hash/hash.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include "logging.h"
+
+/* Hash with the fixed seed behind @p: the one we saw a failure on. */
+static uint64_t fixedhash(const void *key, size_t len, uint64_t seed, void *p)
+{
+	const uint64_t *fixed_seed = p;
+	return hash64_stable((const unsigned char *)key, len, *fixed_seed);
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i, j;
+	struct ntdb_context *ntdb;
+	uint64_t seed = 16014841315512641303ULL;
+	union ntdb_attribute fixed_hattr
+		= { .hash = { .base = { NTDB_ATTRIBUTE_HASH },
+			      .fn = fixedhash,
+			      .data = &seed } };
+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
+			NTDB_NOMMAP|NTDB_CONVERT };
+	NTDB_DATA key = { (unsigned char *)&j, sizeof(j) };
+	NTDB_DATA data = { (unsigned char *)&j, sizeof(j) };
+	/* Chain the tap logging attribute behind the fixed-hash attribute. */
+	fixed_hattr.base.next = &tap_log_attr;
+	/* Per flag set: 1 open check plus 3 checks for each of 500 records. */
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * (1 + 500 * 3) + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		ntdb = ntdb_open("run-12-store.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &fixed_hattr);
+		ok1(ntdb);
+		if (!ntdb)
+			continue;
+
+		/* We seemed to lose some keys.
+		 * Insert and check they're in there! */
+		for (j = 0; j < 500; j++) {
+			NTDB_DATA d = { NULL, 0 }; /* Bogus GCC warning */
+			ok1(ntdb_store(ntdb, key, data, NTDB_REPLACE) == 0);
+			ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
+			ok1(ntdb_deq(d, data));
+			free(d.dptr);
+		}
+		ntdb_close(ntdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/api-13-delete.c b/lib/ntdb/test/api-13-delete.c
new file mode 100644
index 0000000000..182252b109
--- /dev/null
+++ b/lib/ntdb/test/api-13-delete.c
@@ -0,0 +1,205 @@
+#include "private.h" // For NTDB_TOPLEVEL_HASH_BITS
+#include <ccan/hash/hash.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "ntdb.h"
+#include "tap-interface.h"
+#include "logging.h"
+
+/* We rig the hash so adjacent-numbered records always clash. */
+static uint64_t clash(const void *key, size_t len, uint64_t seed, void *priv)
+{
+	uint64_t v = *(const unsigned int *)key; /* key's leading unsigned int */
+	return v << (64 - NTDB_TOPLEVEL_HASH_BITS - 1);
+}
+
+/* Hash with the fixed seed behind @p: the one we saw a failure on. */
+static uint64_t fixedhash(const void *key, size_t len, uint64_t seed, void *p)
+{
+	const uint64_t *fixed_seed = p;
+	return hash64_stable((const unsigned char *)key, len, *fixed_seed);
+}
+
+/* Store 1000 int records (key == data), reading each one straight back. */
+static bool store_records(struct ntdb_context *ntdb)
+{
+	int i;
+	NTDB_DATA key = { (unsigned char *)&i, sizeof(i) };
+	NTDB_DATA d, data = { (unsigned char *)&i, sizeof(i) };
+	bool same = true;
+	for (i = 0; same && i < 1000; i++) {
+		if (ntdb_store(ntdb, key, data, NTDB_REPLACE) != 0
+		    || ntdb_fetch(ntdb, key, &d) != NTDB_SUCCESS)
+			return false;	/* d is only valid after a good fetch */
+		same = ntdb_deq(d, data);
+		free(d.dptr);		/* freed on mismatch too */
+	}
+	return same;
+}
+/* Exercise delete against the colliding keys val, val + 1 and val + 2. */
+static void test_val(struct ntdb_context *ntdb, uint64_t val)
+{
+	uint64_t v;
+	NTDB_DATA key = { (unsigned char *)&v, sizeof(v) };
+	NTDB_DATA d, data = { (unsigned char *)&v, sizeof(v) };
+	/* key and data both alias v: changing v retargets every call below. */
+	/* Insert an entry, then delete it. */
+	v = val;
+	/* Delete should fail. */
+	ok1(ntdb_delete(ntdb, key) == NTDB_ERR_NOEXIST);
+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+	/* Insert should succeed. */
+	ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+	/* Delete should succeed. */
+	ok1(ntdb_delete(ntdb, key) == 0);
+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+	/* Re-add it, then add collision. */
+	ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
+	v = val + 1;
+	ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+	/* Can find both? */
+	ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
+	ok1(d.dsize == data.dsize);
+	free(d.dptr);
+	v = val;
+	ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
+	ok1(d.dsize == data.dsize);
+	free(d.dptr);
+
+	/* Delete second one. */
+	v = val + 1;
+	ok1(ntdb_delete(ntdb, key) == 0);
+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+	/* Re-add */
+	ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+	/* Now, try deleting first one. */
+	v = val;
+	ok1(ntdb_delete(ntdb, key) == 0);
+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+	/* Can still find second? */
+	v = val + 1;
+	ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
+	ok1(d.dsize == data.dsize);
+	free(d.dptr);
+
+	/* Now, this will be ideally placed. */
+	v = val + 2;
+	ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+	/* This will collide with both. */
+	v = val;
+	ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
+
+	/* We can still find them all, right? */
+	ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
+	ok1(d.dsize == data.dsize);
+	free(d.dptr);
+	v = val + 1;
+	ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
+	ok1(d.dsize == data.dsize);
+	free(d.dptr);
+	v = val + 2;
+	ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
+	ok1(d.dsize == data.dsize);
+	free(d.dptr);
+
+	/* And if we delete val + 1, that val + 2 should not move! */
+	v = val + 1;
+	ok1(ntdb_delete(ntdb, key) == 0);
+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+	v = val;
+	ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
+	ok1(d.dsize == data.dsize);
+	free(d.dptr);
+	v = val + 2;
+	ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
+	ok1(d.dsize == data.dsize);
+	free(d.dptr);
+
+	/* Delete those two, so we are empty. */
+	ok1(ntdb_delete(ntdb, key) == 0);
+	v = val;
+	ok1(ntdb_delete(ntdb, key) == 0);
+
+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i, j;
+	struct ntdb_context *ntdb;
+	uint64_t seed = 16014841315512641303ULL;
+	union ntdb_attribute clash_hattr
+		= { .hash = { .base = { NTDB_ATTRIBUTE_HASH },
+			      .fn = clash } };
+	union ntdb_attribute fixed_hattr
+		= { .hash = { .base = { NTDB_ATTRIBUTE_HASH },
+			      .fn = fixedhash,
+			      .data = &seed } };
+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
+			NTDB_NOMMAP|NTDB_CONVERT };
+	/* These two values gave trouble before. */
+	int vals[] = { 755, 837 };
+	/* Both hash attributes chain on to the tap logging attribute. */
+	clash_hattr.base.next = &tap_log_attr;
+	fixed_hattr.base.next = &tap_log_attr;
+	/* Per flag set: 3 test_val() runs, 5 other checks, 2 per vals[] entry. */
+	plan_tests(sizeof(flags) / sizeof(flags[0])
+		   * (39 * 3 + 5 + sizeof(vals)/sizeof(vals[0])*2) + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		ntdb = ntdb_open("run-13-delete.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &clash_hattr);
+		ok1(ntdb);
+		if (!ntdb)
+			continue;
+
+		/* Check start of hash table. */
+		test_val(ntdb, 0);
+
+		/* Check end of hash table. */
+		test_val(ntdb, -1ULL);
+
+		/* Check mixed bitpattern. */
+		test_val(ntdb, 0x123456789ABCDEF0ULL);
+
+		ok1(!ntdb->file || (ntdb->file->allrecord_lock.count == 0
+				   && ntdb->file->num_lockrecs == 0));
+		ntdb_close(ntdb);
+
+		/* Deleting these entries in the db gave problems. */
+		ntdb = ntdb_open("run-13-delete.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &fixed_hattr);
+		ok1(ntdb);
+		if (!ntdb)
+			continue;
+
+		ok1(store_records(ntdb));
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+		for (j = 0; j < sizeof(vals)/sizeof(vals[0]); j++) {
+			NTDB_DATA key;
+
+			key.dptr = (unsigned char *)&vals[j];
+			key.dsize = sizeof(vals[j]);
+			ok1(ntdb_delete(ntdb, key) == 0);
+			ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+		}
+		ntdb_close(ntdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/api-14-exists.c b/lib/ntdb/test/api-14-exists.c
new file mode 100644
index 0000000000..88663cad65
--- /dev/null
+++ b/lib/ntdb/test/api-14-exists.c
@@ -0,0 +1,54 @@
+#include "config.h"
+#include "ntdb.h"
+#include "tap-interface.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "logging.h"
+
+/* Check ntdb_exists() around the store and the delete of 1000 int keys. */
+static bool test_records(struct ntdb_context *ntdb)
+{
+	int i;
+	NTDB_DATA key = { (unsigned char *)&i, sizeof(i) };
+	NTDB_DATA data = { (unsigned char *)&i, sizeof(i) };
+
+	/* Absent before the store, present after it. */
+	for (i = 0; i < 1000; i++) {
+		if (ntdb_exists(ntdb, key)
+		    || ntdb_store(ntdb, key, data, NTDB_REPLACE) != 0
+		    || !ntdb_exists(ntdb, key))
+			return false;
+	}
+
+	/* Present before the delete, absent after it. */
+	for (i = 0; i < 1000; i++) {
+		if (!ntdb_exists(ntdb, key)
+		    || ntdb_delete(ntdb, key) != 0
+		    || ntdb_exists(ntdb, key))
+			return false;
+	}
+
+	return true;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct ntdb_context *ntdb;
+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
+			NTDB_NOMMAP|NTDB_CONVERT };
+	/* Two checks per flag set (open, records) plus the final log check. */
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 2 + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		ntdb = ntdb_open("run-14-exists.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (ok1(ntdb)) {
+			ok1(test_records(ntdb));
+			ntdb_close(ntdb); /* never close a failed (NULL) open */
+		}
+	}
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/api-16-wipe_all.c b/lib/ntdb/test/api-16-wipe_all.c
new file mode 100644
index 0000000000..c1bda8e4f4
--- /dev/null
+++ b/lib/ntdb/test/api-16-wipe_all.c
@@ -0,0 +1,46 @@
+#include "config.h"
+#include "ntdb.h"
+#include "tap-interface.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "logging.h"
+
+/* Store 1000 int records, each keyed by its own value. */
+static bool add_records(struct ntdb_context *ntdb)
+{
+	int i = 0;
+	NTDB_DATA key = { (unsigned char *)&i, sizeof(i) };
+	NTDB_DATA data = { (unsigned char *)&i, sizeof(i) };
+
+	while (i < 1000 && ntdb_store(ntdb, key, data, NTDB_REPLACE) == 0)
+		i++;
+	/* Stopping short of 1000 means a store failed. */
+	return i == 1000;
+}
+
+/* Fill a database, wipe it, and check that no first key remains. */
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct ntdb_context *ntdb;
+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
+			NTDB_NOMMAP|NTDB_CONVERT };
+	/* Four checks per flag set plus the final log-message check. */
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 4 + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		ntdb = ntdb_open("run-16-wipe_all.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (ok1(ntdb)) {
+			NTDB_DATA key;
+			ok1(add_records(ntdb));
+			ok1(ntdb_wipe_all(ntdb) == NTDB_SUCCESS);
+			ok1(ntdb_firstkey(ntdb, &key) == NTDB_ERR_NOEXIST);
+			ntdb_close(ntdb);
+		}
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/api-21-parse_record.c b/lib/ntdb/test/api-21-parse_record.c
new file mode 100644
index 0000000000..fa48562e17
--- /dev/null
+++ b/lib/ntdb/test/api-21-parse_record.c
@@ -0,0 +1,67 @@
+#include "config.h"
+#include "ntdb.h"
+#include "tap-interface.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "logging.h"
+
+static enum NTDB_ERROR parse(NTDB_DATA key, NTDB_DATA data, NTDB_DATA *expected)
+{
+	/* Succeed only when the record's data equals what the caller expects. */
+	bool same = ntdb_deq(data, *expected);
+	return same ? NTDB_SUCCESS : NTDB_ERR_EINVAL;
+}
+
+static enum NTDB_ERROR parse_err(NTDB_DATA key, NTDB_DATA data, void *unused)
+{
+	return 100; /* arbitrary value that test_records() expects to get back */
+}
+
+/* Store 1000 records, then run ntdb_parse_record() over hits, a miss and
+ * a callback that returns an error. */
+static bool test_records(struct ntdb_context *ntdb)
+{
+	int i;
+	NTDB_DATA key = { (unsigned char *)&i, sizeof(i) };
+	NTDB_DATA data = { (unsigned char *)&i, sizeof(i) };
+
+	for (i = 0; i < 1000; i++)
+		if (ntdb_store(ntdb, key, data, NTDB_REPLACE) != 0)
+			return false;
+
+	/* Each stored record must parse back as its own key value. */
+	for (i = 0; i < 1000; i++)
+		if (ntdb_parse_record(ntdb, key, parse, &data) != NTDB_SUCCESS)
+			return false;
+
+	/* i is 1000 here: a key that was never stored. */
+	if (ntdb_parse_record(ntdb, key, parse, &data) != NTDB_ERR_NOEXIST)
+		return false;
+
+	/* Test error return from parse function. */
+	/* Key 0 was stored above, so the callback itself gets called. */
+	i = 0;
+	return ntdb_parse_record(ntdb, key, parse_err, NULL) == 100;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct ntdb_context *ntdb;
+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
+			NTDB_NOMMAP|NTDB_CONVERT };
+	/* Two checks per flag set (open, records) plus the final log check. */
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 2 + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		ntdb = ntdb_open("api-21-parse_record.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (ok1(ntdb)) {
+			ok1(test_records(ntdb));
+			ntdb_close(ntdb); /* never close a failed (NULL) open */
+		}
+	}
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/api-55-transaction.c b/lib/ntdb/test/api-55-transaction.c
new file mode 100644
index 0000000000..d51dd0b13e
--- /dev/null
+++ b/lib/ntdb/test/api-55-transaction.c
@@ -0,0 +1,73 @@
+#include "private.h" // struct ntdb_context
+#include "ntdb.h"
+#include "tap-interface.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct ntdb_context *ntdb;
+	unsigned char *buffer;
+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
+	NTDB_DATA key = ntdb_mkdata("key", 3);
+	NTDB_DATA data;
+	/* 1000 bytes of a known pattern (i truncated to a byte) as record data. */
+	buffer = malloc(1000); /* NOTE(review): the result is not checked */
+	for (i = 0; i < 1000; i++)
+		buffer[i] = i;
+	/* Twenty checks per flag set plus the final log-message check. */
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 20 + 1);
+
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		ntdb = ntdb_open("run-55-transaction.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(ntdb);
+		if (!ntdb)
+			continue;
+
+		ok1(ntdb_transaction_start(ntdb) == 0);
+		data.dptr = buffer;
+		data.dsize = 1000;
+		ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
+		ok1(ntdb_fetch(ntdb, key, &data) == NTDB_SUCCESS);
+		ok1(data.dsize == 1000);
+		ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
+		free(data.dptr);
+
+		/* Cancelling a transaction means no store */
+		ntdb_transaction_cancel(ntdb);
+		ok1(ntdb->file->allrecord_lock.count == 0
+		    && ntdb->file->num_lockrecs == 0);
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+		ok1(ntdb_fetch(ntdb, key, &data) == NTDB_ERR_NOEXIST);
+
+		/* Commit the transaction. */
+		ok1(ntdb_transaction_start(ntdb) == 0);
+		data.dptr = buffer;
+		data.dsize = 1000;
+		ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
+		ok1(ntdb_fetch(ntdb, key, &data) == NTDB_SUCCESS);
+		ok1(data.dsize == 1000);
+		ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
+		free(data.dptr);
+		ok1(ntdb_transaction_commit(ntdb) == 0);
+		ok1(ntdb->file->allrecord_lock.count == 0
+		    && ntdb->file->num_lockrecs == 0);
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+		ok1(ntdb_fetch(ntdb, key, &data) == NTDB_SUCCESS);
+		ok1(data.dsize == 1000);
+		ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
+		free(data.dptr);
+
+		ntdb_close(ntdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	free(buffer);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/api-80-tdb_fd.c b/lib/ntdb/test/api-80-tdb_fd.c
new file mode 100644
index 0000000000..39a9df414e
--- /dev/null
+++ b/lib/ntdb/test/api-80-tdb_fd.c
@@ -0,0 +1,32 @@
+#include "config.h"
+#include "ntdb.h"
+#include "tap-interface.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct ntdb_context *ntdb;
+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
+			NTDB_NOMMAP|NTDB_CONVERT };
+	/* Three checks per flag set: open, fd value, no log messages. */
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 3);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		ntdb = ntdb_open("api-80-ntdb_fd.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (!ok1(ntdb))
+			continue;
+		/* Internal databases report fd -1; others must be above fds 0-2. */
+		if (flags[i] & NTDB_INTERNAL)
+			ok1(ntdb_fd(ntdb) == -1);
+		else
+			ok1(ntdb_fd(ntdb) > 2);
+		ntdb_close(ntdb);
+		ok1(tap_log_messages == 0);
+	}
+	return exit_status();
+}
diff --git a/lib/ntdb/test/api-81-seqnum.c b/lib/ntdb/test/api-81-seqnum.c
new file mode 100644
index 0000000000..93ad53ab07
--- /dev/null
+++ b/lib/ntdb/test/api-81-seqnum.c
@@ -0,0 +1,69 @@
+#include "config.h"
+#include "ntdb.h"
+#include "tap-interface.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{
+	unsigned int i, seq;
+	struct ntdb_context *ntdb;
+	NTDB_DATA d = { NULL, 0 }; /* Bogus GCC warning */
+	NTDB_DATA key = ntdb_mkdata("key", 3);
+	NTDB_DATA data = ntdb_mkdata("data", 4);
+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
+			NTDB_NOMMAP|NTDB_CONVERT };
+	/* 15 checks per flag set, 13 more for each of the 4 non-internal ones. */
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 15 + 4 * 13);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		ntdb = ntdb_open("api-81-seqnum.ntdb", flags[i]|NTDB_SEQNUM,
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (!ok1(ntdb))
+			continue;
+
+		seq = 0;
+		ok1(ntdb_get_seqnum(ntdb) == seq);
+		ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
+		ok1(ntdb_get_seqnum(ntdb) == ++seq);
+		/* Fetch doesn't change seqnum */
+		if (ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS))
+			free(d.dptr);
+		ok1(ntdb_get_seqnum(ntdb) == seq);
+		ok1(ntdb_append(ntdb, key, data) == NTDB_SUCCESS);
+		ok1(ntdb_get_seqnum(ntdb) == ++seq);
+
+		ok1(ntdb_delete(ntdb, key) == NTDB_SUCCESS);
+		ok1(ntdb_get_seqnum(ntdb) == ++seq);
+		/* Empty append works */
+		ok1(ntdb_append(ntdb, key, data) == NTDB_SUCCESS);
+		ok1(ntdb_get_seqnum(ntdb) == ++seq);
+
+		ok1(ntdb_wipe_all(ntdb) == NTDB_SUCCESS);
+		ok1(ntdb_get_seqnum(ntdb) == ++seq);
+		/* Transactions are only exercised on non-internal databases. */
+		if (!(flags[i] & NTDB_INTERNAL)) {
+			ok1(ntdb_transaction_start(ntdb) == NTDB_SUCCESS);
+			ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
+			ok1(ntdb_get_seqnum(ntdb) == ++seq);
+			ok1(ntdb_append(ntdb, key, data) == NTDB_SUCCESS);
+			ok1(ntdb_get_seqnum(ntdb) == ++seq);
+			ok1(ntdb_delete(ntdb, key) == NTDB_SUCCESS);
+			ok1(ntdb_get_seqnum(ntdb) == ++seq);
+			ok1(ntdb_transaction_commit(ntdb) == NTDB_SUCCESS);
+			ok1(ntdb_get_seqnum(ntdb) == seq);
+			/* A cancelled transaction must roll the seqnum back. */
+			ok1(ntdb_transaction_start(ntdb) == NTDB_SUCCESS);
+			ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
+			ok1(ntdb_get_seqnum(ntdb) == seq + 1);
+			ntdb_transaction_cancel(ntdb);
+			ok1(ntdb_get_seqnum(ntdb) == seq);
+		}
+		ntdb_close(ntdb);
+		ok1(tap_log_messages == 0);
+	}
+	return exit_status();
+}
diff --git a/lib/ntdb/test/api-82-lockattr.c b/lib/ntdb/test/api-82-lockattr.c
new file mode 100644
index 0000000000..51bb939f59
--- /dev/null
+++ b/lib/ntdb/test/api-82-lockattr.c
@@ -0,0 +1,237 @@
+#include "private.h" // for ntdb_fcntl_unlock
+#include "ntdb.h"
+#include "tap-interface.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include "logging.h"
+
+static int mylock(int fd, int rw, off_t off, off_t len, bool waitflag,
+		  void *_err)
+{
+	/* Error to inject, handed over through the flock attribute's data. */
+	const int *forced = _err;
+	const int cmd = waitflag ? F_SETLKW : F_SETLK;
+	struct flock fl;
+	int rc;
+
+	/* Injected failure: report it via errno without touching the file. */
+	if (*forced != 0) {
+		errno = *forced;
+		return -1;
+	}
+
+	/* Otherwise take a real fcntl lock, retrying if a signal interrupts. */
+	for (;;) {
+		fl.l_type = rw;
+		fl.l_whence = SEEK_SET;
+		fl.l_start = off;
+		fl.l_len = len;
+		rc = fcntl(fd, cmd, &fl);
+		if (rc == 0 || errno != EINTR)
+			return rc;
+	}
+}
+
+static int trav_err;	/* errno that trav() plants for the next lock attempt */
+static int trav(struct ntdb_context *ntdb, NTDB_DATA k, NTDB_DATA d, int *terr)
+{
+	*terr = trav_err;	/* main() passes &lock_err here, which mylock reads */
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct ntdb_context *ntdb;
+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
+	union ntdb_attribute lock_attr;
+	NTDB_DATA key = ntdb_mkdata("key", 3);
+	NTDB_DATA data = ntdb_mkdata("data", 4);
+	int lock_err;
+	/* Route every lock request through mylock(), driven by lock_err. */
+	lock_attr.base.attr = NTDB_ATTRIBUTE_FLOCK;
+	lock_attr.base.next = &tap_log_attr;
+	lock_attr.flock.lock = mylock;
+	lock_attr.flock.unlock = ntdb_fcntl_unlock;
+	lock_attr.flock.data = &lock_err;
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 80);
+
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		NTDB_DATA d;
+
+		/* Nonblocking open; expect no error message. */
+		lock_err = EAGAIN;
+		ntdb = ntdb_open("run-82-lockattr.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr);
+		ok(errno == lock_err, "Errno is %u", errno);
+		ok1(!ntdb);
+		ok1(tap_log_messages == 0);
+
+		lock_err = EINTR;
+		ntdb = ntdb_open("run-82-lockattr.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr);
+		ok(errno == lock_err, "Errno is %u", errno);
+		ok1(!ntdb);
+		ok1(tap_log_messages == 0);
+
+		/* Forced fail open. */
+		lock_err = ENOMEM;
+		ntdb = ntdb_open("run-82-lockattr.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr);
+		ok1(errno == lock_err);
+		ok1(!ntdb);
+		ok1(tap_log_messages == 1);
+		tap_log_messages = 0;
+
+		lock_err = 0;
+		ntdb = ntdb_open("run-82-lockattr.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr);
+		if (!ok1(ntdb))
+			continue;
+		ok1(tap_log_messages == 0);
+
+		/* Nonblocking store. */
+		lock_err = EAGAIN;
+		ok1(ntdb_store(ntdb, key, data, NTDB_REPLACE) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = EINTR;
+		ok1(ntdb_store(ntdb, key, data, NTDB_REPLACE) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = ENOMEM;
+		ok1(ntdb_store(ntdb, key, data, NTDB_REPLACE) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 1);
+		tap_log_messages = 0;
+
+		/* Nonblocking fetch. */
+		lock_err = EAGAIN;
+		ok1(!ntdb_exists(ntdb, key));
+		ok1(tap_log_messages == 0);
+		lock_err = EINTR;
+		ok1(!ntdb_exists(ntdb, key));
+		ok1(tap_log_messages == 0);
+		lock_err = ENOMEM;
+		ok1(!ntdb_exists(ntdb, key));
+		ok1(tap_log_messages == 1);
+		tap_log_messages = 0;
+
+		lock_err = EAGAIN;
+		ok1(ntdb_fetch(ntdb, key, &d) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = EINTR;
+		ok1(ntdb_fetch(ntdb, key, &d) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = ENOMEM;
+		ok1(ntdb_fetch(ntdb, key, &d) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 1);
+		tap_log_messages = 0;
+
+		/* Nonblocking delete. */
+		lock_err = EAGAIN;
+		ok1(ntdb_delete(ntdb, key) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = EINTR;
+		ok1(ntdb_delete(ntdb, key) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = ENOMEM;
+		ok1(ntdb_delete(ntdb, key) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 1);
+		tap_log_messages = 0;
+
+		/* Nonblocking locks. */
+		lock_err = EAGAIN;
+		ok1(ntdb_chainlock(ntdb, key) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = EINTR;
+		ok1(ntdb_chainlock(ntdb, key) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = ENOMEM;
+		ok1(ntdb_chainlock(ntdb, key) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 1);
+		tap_log_messages = 0;
+
+		lock_err = EAGAIN;
+		ok1(ntdb_chainlock_read(ntdb, key) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = EINTR;
+		ok1(ntdb_chainlock_read(ntdb, key) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = ENOMEM;
+		ok1(ntdb_chainlock_read(ntdb, key) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 1);
+		tap_log_messages = 0;
+
+		lock_err = EAGAIN;
+		ok1(ntdb_lockall(ntdb) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = EINTR;
+		ok1(ntdb_lockall(ntdb) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = ENOMEM;
+		ok1(ntdb_lockall(ntdb) == NTDB_ERR_LOCK);
+		/* This actually does divide and conquer, so allow several messages. */
+		ok1(tap_log_messages > 0);
+		tap_log_messages = 0;
+
+		lock_err = EAGAIN;
+		ok1(ntdb_lockall_read(ntdb) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = EINTR;
+		ok1(ntdb_lockall_read(ntdb) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = ENOMEM;
+		ok1(ntdb_lockall_read(ntdb) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages > 0);
+		tap_log_messages = 0;
+
+		/* Nonblocking traverse; trav() goes nonblock partway through. */
+		lock_err = 0;
+		ok1(ntdb_store(ntdb, key, data, NTDB_REPLACE) == 0);
+		trav_err = EAGAIN;
+		ok1(ntdb_traverse(ntdb, trav, &lock_err) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		trav_err = EINTR;
+		lock_err = 0;
+		ok1(ntdb_traverse(ntdb, trav, &lock_err) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		trav_err = ENOMEM;
+		lock_err = 0;
+		ok1(ntdb_traverse(ntdb, trav, &lock_err) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 1);
+		tap_log_messages = 0;
+
+		/* Nonblocking transactions. */
+		lock_err = EAGAIN;
+		ok1(ntdb_transaction_start(ntdb) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = EINTR;
+		ok1(ntdb_transaction_start(ntdb) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+		lock_err = ENOMEM;
+		ok1(ntdb_transaction_start(ntdb) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 1);
+		tap_log_messages = 0;
+
+		/* Nonblocking transaction prepare. */
+		lock_err = 0;
+		ok1(ntdb_transaction_start(ntdb) == 0);
+		ok1(ntdb_delete(ntdb, key) == 0);
+
+		lock_err = EAGAIN;
+		ok1(ntdb_transaction_prepare_commit(ntdb) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+
+		lock_err = 0;
+		ok1(ntdb_transaction_prepare_commit(ntdb) == 0);
+		ok1(ntdb_transaction_commit(ntdb) == 0);
+
+		/* And the transaction was committed, right? */
+		ok1(!ntdb_exists(ntdb, key));
+		ntdb_close(ntdb);
+		ok1(tap_log_messages == 0);
+	}
+	return exit_status();
+}
diff --git a/lib/ntdb/test/api-83-openhook.c b/lib/ntdb/test/api-83-openhook.c
new file mode 100644
index 0000000000..9f474c9ab8
--- /dev/null
+++ b/lib/ntdb/test/api-83-openhook.c
@@ -0,0 +1,96 @@
+#include "config.h"
+#include "ntdb.h"
+#include "tap-interface.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include "external-agent.h"
+#include "logging.h"
+
+static enum NTDB_ERROR clear_if_first(int fd, void *arg)
+{
+	/* Byte 4 of the file is the "somebody has this open" marker: every
+	 * opener keeps a read lock on it (as tdb's TDB_CLEAR_IF_FIRST does). */
+	struct flock marker;
+	bool alone;
+
+	/* The hook data must be this function's own address. */
+	if (arg != clear_if_first)
+		return NTDB_ERR_CORRUPT;
+
+	marker.l_whence = SEEK_SET;
+	marker.l_start = 4;
+	marker.l_len = 1;
+
+	/* Winning the write lock means nobody else holds the marker. */
+	marker.l_type = F_WRLCK;
+	alone = (fcntl(fd, F_SETLK, &marker) == 0);
+	if (alone) {
+		diag("truncating file!");
+		if (ftruncate(fd, 0) != 0)
+			return NTDB_ERR_IO;
+	}
+	/* Downgrade to (or wait for) the shared lock that we keep. */
+	marker.l_type = F_RDLCK;
+	return fcntl(fd, F_SETLKW, &marker) == 0 ? NTDB_SUCCESS : NTDB_ERR_IO;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct ntdb_context *ntdb;
+	struct agent *agent;
+	union ntdb_attribute cif;
+	NTDB_DATA key = ntdb_mkdata("key", 3);
+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
+	/* The hook gets its own address as data; clear_if_first checks that. */
+	cif.openhook.base.attr = NTDB_ATTRIBUTE_OPENHOOK;
+	cif.openhook.base.next = &tap_log_attr;
+	cif.openhook.fn = clear_if_first;
+	cif.openhook.data = clear_if_first;
+
+	agent = prepare_external_agent();
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 13);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		/* Create it */
+		ntdb = ntdb_open("run-83-openhook.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, NULL);
+		ok1(ntdb);
+		ok1(ntdb_store(ntdb, key, key, NTDB_REPLACE) == 0);
+		ntdb_close(ntdb);
+
+		/* Now, open with CIF (the clear-if-first hook); should clear it. */
+		ntdb = ntdb_open("run-83-openhook.ntdb", flags[i],
+			       O_RDWR, 0, &cif);
+		ok1(ntdb);
+		ok1(!ntdb_exists(ntdb, key));
+		ok1(ntdb_store(ntdb, key, key, NTDB_REPLACE) == 0);
+
+		/* Agent should not clear it, since it's still open. */
+		ok1(external_agent_operation(agent, OPEN_WITH_HOOK,
+					     "run-83-openhook.ntdb") == SUCCESS);
+		ok1(external_agent_operation(agent, FETCH, "key") == SUCCESS);
+		ok1(external_agent_operation(agent, CLOSE, "") == SUCCESS);
+
+		/* Still exists for us too. */
+		ok1(ntdb_exists(ntdb, key));
+
+		/* Close it, now agent should clear it. */
+		ntdb_close(ntdb);
+
+		ok1(external_agent_operation(agent, OPEN_WITH_HOOK,
+					     "run-83-openhook.ntdb") == SUCCESS);
+		ok1(external_agent_operation(agent, FETCH, "key") == FAILED);
+		ok1(external_agent_operation(agent, CLOSE, "") == SUCCESS);
+
+		ok1(tap_log_messages == 0);
+	}
+
+	free_external_agent(agent);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/api-91-get-stats.c b/lib/ntdb/test/api-91-get-stats.c
new file mode 100644
index 0000000000..786885b44c
--- /dev/null
+++ b/lib/ntdb/test/api-91-get-stats.c
@@ -0,0 +1,57 @@
+#include "config.h"
+#include "ntdb.h"
+#include "tap-interface.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct ntdb_context *ntdb;
+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 11);
+
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		union ntdb_attribute *attr;
+		NTDB_DATA key = ntdb_mkdata("key", 3);
+
+		ntdb = ntdb_open("run-91-get-stats.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(ntdb);
+		ok1(ntdb_store(ntdb, key, key, NTDB_REPLACE) == 0);
+
+		/* Use malloc so valgrind will catch overruns. */
+		attr = malloc(sizeof *attr);
+		attr->stats.base.attr = NTDB_ATTRIBUTE_STATS;
+		attr->stats.size = sizeof(*attr);
+
+		ok1(ntdb_get_attribute(ntdb, attr) == 0);
+		ok1(attr->stats.size == sizeof(*attr));
+		ok1(attr->stats.allocs > 0);
+		ok1(attr->stats.expands > 0);
+		ok1(attr->stats.locks > 0);
+		free(attr);
+
+		/* Try short one: the buffer ends right after the allocs field. */
+		attr = malloc(offsetof(struct ntdb_attribute_stats, allocs)
+			      + sizeof(attr->stats.allocs));
+		attr->stats.base.attr = NTDB_ATTRIBUTE_STATS;
+		attr->stats.size = offsetof(struct ntdb_attribute_stats, allocs)
+			+ sizeof(attr->stats.allocs);
+		ok1(ntdb_get_attribute(ntdb, attr) == 0);
+		ok1(attr->stats.size == sizeof(*attr));
+		ok1(attr->stats.allocs > 0);
+		free(attr);
+		ok1(tap_log_messages == 0);
+
+		ntdb_close(ntdb);
+
+	}
+	return exit_status();
+}
diff --git a/lib/ntdb/test/api-92-get-set-readonly.c b/lib/ntdb/test/api-92-get-set-readonly.c
new file mode 100644
index 0000000000..7abd304eef
--- /dev/null
+++ b/lib/ntdb/test/api-92-get-set-readonly.c
@@ -0,0 +1,105 @@
+#include "config.h"
+#include "ntdb.h"
+#include "tap-interface.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct ntdb_context *ntdb;
+	NTDB_DATA key = ntdb_mkdata("key", 3);
+	NTDB_DATA data = ntdb_mkdata("data", 4);
+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 48);
+
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		/* RW -> RO */
+		ntdb = ntdb_open("run-92-get-set-readonly.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(ntdb);
+		ok1(!(ntdb_get_flags(ntdb) & NTDB_RDONLY));
+
+		ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == NTDB_SUCCESS);
+
+		ntdb_add_flag(ntdb, NTDB_RDONLY);
+		ok1(ntdb_get_flags(ntdb) & NTDB_RDONLY);
+
+		/* Can't store, append, delete. */
+		ok1(ntdb_store(ntdb, key, data, NTDB_MODIFY) == NTDB_ERR_RDONLY);
+		ok1(tap_log_messages == 1);
+		ok1(ntdb_append(ntdb, key, data) == NTDB_ERR_RDONLY);
+		ok1(tap_log_messages == 2);
+		ok1(ntdb_delete(ntdb, key) == NTDB_ERR_RDONLY);
+		ok1(tap_log_messages == 3);
+
+		/* Can't start a transaction, or any write lock. */
+		ok1(ntdb_transaction_start(ntdb) == NTDB_ERR_RDONLY);
+		ok1(tap_log_messages == 4);
+		ok1(ntdb_chainlock(ntdb, key) == NTDB_ERR_RDONLY);
+		ok1(tap_log_messages == 5);
+		ok1(ntdb_lockall(ntdb) == NTDB_ERR_RDONLY);
+		ok1(tap_log_messages == 6);
+		ok1(ntdb_wipe_all(ntdb) == NTDB_ERR_RDONLY);
+		ok1(tap_log_messages == 7);
+
+		/* Back to RW. */
+		ntdb_remove_flag(ntdb, NTDB_RDONLY);
+		ok1(!(ntdb_get_flags(ntdb) & NTDB_RDONLY));
+
+		ok1(ntdb_store(ntdb, key, data, NTDB_MODIFY) == NTDB_SUCCESS);
+		ok1(ntdb_append(ntdb, key, data) == NTDB_SUCCESS);
+		ok1(ntdb_delete(ntdb, key) == NTDB_SUCCESS);
+
+		ok1(ntdb_transaction_start(ntdb) == NTDB_SUCCESS);
+		ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == NTDB_SUCCESS);
+		ok1(ntdb_transaction_commit(ntdb) == NTDB_SUCCESS);
+
+		ok1(ntdb_chainlock(ntdb, key) == NTDB_SUCCESS);
+		ntdb_chainunlock(ntdb, key);
+		ok1(ntdb_lockall(ntdb) == NTDB_SUCCESS);
+		ntdb_unlockall(ntdb);
+		ok1(ntdb_wipe_all(ntdb) == NTDB_SUCCESS);
+		ok1(tap_log_messages == 7);
+
+		ntdb_close(ntdb);
+
+		/* RO -> RW */
+		ntdb = ntdb_open("run-92-get-set-readonly.ntdb", flags[i],
+			       O_RDONLY, 0600, &tap_log_attr);
+		ok1(ntdb);
+		ok1(ntdb_get_flags(ntdb) & NTDB_RDONLY);
+
+		/* Can't store, append, delete. */
+		ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == NTDB_ERR_RDONLY);
+		ok1(tap_log_messages == 8);
+		ok1(ntdb_append(ntdb, key, data) == NTDB_ERR_RDONLY);
+		ok1(tap_log_messages == 9);
+		ok1(ntdb_delete(ntdb, key) == NTDB_ERR_RDONLY);
+		ok1(tap_log_messages == 10);
+
+		/* Can't start a transaction, or any write lock. */
+		ok1(ntdb_transaction_start(ntdb) == NTDB_ERR_RDONLY);
+		ok1(tap_log_messages == 11);
+		ok1(ntdb_chainlock(ntdb, key) == NTDB_ERR_RDONLY);
+		ok1(tap_log_messages == 12);
+		ok1(ntdb_lockall(ntdb) == NTDB_ERR_RDONLY);
+		ok1(tap_log_messages == 13);
+		ok1(ntdb_wipe_all(ntdb) == NTDB_ERR_RDONLY);
+		ok1(tap_log_messages == 14);
+
+		/* Can't remove NTDB_RDONLY since we opened with O_RDONLY */
+		ntdb_remove_flag(ntdb, NTDB_RDONLY);
+		ok1(tap_log_messages == 15);
+		ok1(ntdb_get_flags(ntdb) & NTDB_RDONLY);
+		ntdb_close(ntdb);
+
+		ok1(tap_log_messages == 15);
+		tap_log_messages = 0;
+	}
+	return exit_status();
+}
diff --git a/lib/ntdb/test/api-93-repack.c b/lib/ntdb/test/api-93-repack.c
new file mode 100644
index 0000000000..168bc24c0a
--- /dev/null
+++ b/lib/ntdb/test/api-93-repack.c
@@ -0,0 +1,80 @@
+#include "config.h"
+#include "ntdb.h"
+#include "tap-interface.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "logging.h"
+
+#define NUM_TESTS 1000
+
+static bool store_all(struct ntdb_context *ntdb)
+{
+	/* Key and value both alias the counter, so record n maps n -> n. */
+	unsigned int n = 0;
+	NTDB_DATA rec = { (unsigned char *)&n, sizeof(n) };
+
+	while (n < NUM_TESTS) {
+		if (ntdb_store(ntdb, rec, rec, NTDB_INSERT) != NTDB_SUCCESS)
+			return false;
+		n++;
+	}
+	return true;
+}
+
+static int mark_entry(struct ntdb_context *ntdb,
+		      NTDB_DATA key, NTDB_DATA data, bool found[])
+{
+	unsigned int idx;
+
+	/* Reject keys that are not exactly one unsigned int. */
+	if (key.dsize != sizeof(idx))
+		return -1;
+	memcpy(&idx, key.dptr, sizeof(idx));
+	/* Out-of-range or already-seen record numbers also return -1. */
+	if (idx >= NUM_TESTS || found[idx])
+		return -1;
+	found[idx] = true;
+	return 0;
+}
+
+static bool is_all_set(bool found[], unsigned int num)
+{
+	/* Walk down from the top; any clear slot means not all are set. */
+	while (num > 0) {
+		if (!found[--num])
+			return false;
+	}
+	return true;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	bool found[NUM_TESTS];
+	struct ntdb_context *ntdb;
+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT
+	};
+	/* Six tests per flag set, plus the final log check. */
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 6 + 1);
+
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		ntdb = ntdb_open("run-93-repack.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(ntdb);
+		if (!ntdb)
+			break;
+
+		ok1(store_all(ntdb));
+		/* Repack, then make sure every record is still there exactly once. */
+		ok1(ntdb_repack(ntdb) == NTDB_SUCCESS);
+		memset(found, 0, sizeof(found));
+		ok1(ntdb_check(ntdb, NULL, NULL) == NTDB_SUCCESS);
+		ok1(ntdb_traverse(ntdb, mark_entry, found) == NUM_TESTS);
+		ok1(is_all_set(found, NUM_TESTS));
+		ntdb_close(ntdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/api-add-remove-flags.c b/lib/ntdb/test/api-add-remove-flags.c
new file mode 100644
index 0000000000..4888c32f06
--- /dev/null
+++ b/lib/ntdb/test/api-add-remove-flags.c
@@ -0,0 +1,89 @@
+#include "private.h" // for ntdb_context
+#include "ntdb.h"
+#include "tap-interface.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct ntdb_context *ntdb;
+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
+			NTDB_NOMMAP|NTDB_CONVERT };
+
+	plan_tests(87);	/* 4 file-backed sets * 17 + 2 internal sets * 9 + 1 */
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		ntdb = ntdb_open("run-add-remove-flags.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(ntdb);
+		if (!ntdb)
+			continue;
+
+		ok1(ntdb_get_flags(ntdb) == ntdb->flags);
+		tap_log_messages = 0;
+		ntdb_add_flag(ntdb, NTDB_NOLOCK);
+		if (flags[i] & NTDB_INTERNAL)
+			ok1(tap_log_messages == 1);
+		else {
+			ok1(tap_log_messages == 0);
+			ok1(ntdb_get_flags(ntdb) & NTDB_NOLOCK);
+		}
+
+		tap_log_messages = 0;
+		ntdb_add_flag(ntdb, NTDB_NOMMAP);
+		if (flags[i] & NTDB_INTERNAL)
+			ok1(tap_log_messages == 1);
+		else {
+			ok1(tap_log_messages == 0);
+			ok1(ntdb_get_flags(ntdb) & NTDB_NOMMAP);
+			ok1(ntdb->file->map_ptr == NULL);
+		}
+
+		tap_log_messages = 0;
+		ntdb_add_flag(ntdb, NTDB_NOSYNC);
+		if (flags[i] & NTDB_INTERNAL)
+			ok1(tap_log_messages == 1);
+		else {
+			ok1(tap_log_messages == 0);
+			ok1(ntdb_get_flags(ntdb) & NTDB_NOSYNC);
+		}
+
+		ok1(ntdb_get_flags(ntdb) == ntdb->flags);
+
+		tap_log_messages = 0;
+		ntdb_remove_flag(ntdb, NTDB_NOLOCK);
+		if (flags[i] & NTDB_INTERNAL)
+			ok1(tap_log_messages == 1);
+		else {
+			ok1(tap_log_messages == 0);
+			ok1(!(ntdb_get_flags(ntdb) & NTDB_NOLOCK));
+		}
+
+		tap_log_messages = 0;
+		ntdb_remove_flag(ntdb, NTDB_NOMMAP);
+		if (flags[i] & NTDB_INTERNAL)
+			ok1(tap_log_messages == 1);
+		else {
+			ok1(tap_log_messages == 0);
+			ok1(!(ntdb_get_flags(ntdb) & NTDB_NOMMAP));
+			ok1(ntdb->file->map_ptr != NULL);
+		}
+
+		tap_log_messages = 0;
+		ntdb_remove_flag(ntdb, NTDB_NOSYNC);
+		if (flags[i] & NTDB_INTERNAL)
+			ok1(tap_log_messages == 1);
+		else {
+			ok1(tap_log_messages == 0);
+			ok1(!(ntdb_get_flags(ntdb) & NTDB_NOSYNC));
+		}
+
+		ntdb_close(ntdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/api-check-callback.c b/lib/ntdb/test/api-check-callback.c
new file mode 100644
index 0000000000..f74f04b598
--- /dev/null
+++ b/lib/ntdb/test/api-check-callback.c
@@ -0,0 +1,86 @@
+#include "config.h"
+#include "ntdb.h"
+#include "tap-interface.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "logging.h"
+
+#define NUM_RECORDS 1000
+
+static bool store_records(struct ntdb_context *ntdb)
+{
+	/* Key and data share the counter's storage: record n is n -> n. */
+	int n = 0;
+	NTDB_DATA rec = { (unsigned char *)&n, sizeof(n) };
+
+	while (n < NUM_RECORDS && ntdb_store(ntdb, rec, rec, NTDB_REPLACE) == 0)
+		n++;
+	/* Reaching the end means no store failed. */
+	return n == NUM_RECORDS;
+}
+
+static enum NTDB_ERROR check(NTDB_DATA key,
+			    NTDB_DATA data,
+			    bool *array)
+{
+	int val;	/* record number carried in the key */
+	/* ntdb_check() callback: validate one record and tick it off in array. */
+	if (key.dsize != sizeof(val)) {
+		diag("Wrong key size: %u\n", (unsigned int)key.dsize);
+		return NTDB_ERR_CORRUPT;
+	}
+	/* store_records() writes the same bytes as key and as data. */
+	if (key.dsize != data.dsize
+	    || memcmp(key.dptr, data.dptr, sizeof(val)) != 0) {
+		diag("Key and data differ\n");
+		return NTDB_ERR_CORRUPT;
+	}
+	/* Only 0..NUM_RECORDS-1 were stored; anything else cannot index array. */
+	memcpy(&val, key.dptr, sizeof(val));
+	if (val >= NUM_RECORDS || val < 0) {
+		diag("check value %i\n", val);
+		return NTDB_ERR_CORRUPT;
+	}
+	/* A record must not be reported twice. */
+	if (array[val]) {
+		diag("Value %i already seen\n", val);
+		return NTDB_ERR_CORRUPT;
+	}
+
+	array[val] = true;
+	return NTDB_SUCCESS;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i, j;
+	struct ntdb_context *ntdb;
+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
+			NTDB_NOMMAP|NTDB_CONVERT };
+	/* Four tests per flag set, plus the final log check. */
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 4 + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		bool array[NUM_RECORDS];
+
+		ntdb = ntdb_open("run-check-callback.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(ntdb);
+		if (!ntdb)
+			continue;
+
+		ok1(store_records(ntdb));
+		for (j = 0; j < NUM_RECORDS; j++)
+			array[j] = false;
+		ok1(ntdb_check(ntdb, check, array) == NTDB_SUCCESS);
+		for (j = 0; j < NUM_RECORDS; j++)
+			if (!array[j])
+				break;
+		ok1(j == NUM_RECORDS);
+		ntdb_close(ntdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/api-firstkey-nextkey.c b/lib/ntdb/test/api-firstkey-nextkey.c
new file mode 100644
index 0000000000..da1a68043b
--- /dev/null
+++ b/lib/ntdb/test/api-firstkey-nextkey.c
@@ -0,0 +1,159 @@
+#include "config.h"
+#include "ntdb.h"
+#include "tap-interface.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include "logging.h"
+
+#define NUM_RECORDS 1000
+
+static bool store_records(struct ntdb_context *ntdb)
+{
+	int num;
+	/* One descriptor serves as both key and data: both alias num. */
+	NTDB_DATA kv = { (unsigned char *)&num, sizeof(num) };
+
+	for (num = 0; num < NUM_RECORDS; num++)
+		if (ntdb_store(ntdb, kv, kv, NTDB_REPLACE) != 0)
+			break;
+	return num == NUM_RECORDS;
+}
+
+struct trav_data {
+	unsigned int records[NUM_RECORDS];	/* values, in the order trav() saw them */
+	unsigned int calls;			/* how many times trav() has run */
+};
+
+static int trav(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA dbuf, void *p)
+{
+	struct trav_data *td = p;	/* log of values, in callback order */
+	/* Bail out (non-zero) rather than overrun records[] or mis-size a value. */
+	if (td->calls >= NUM_RECORDS || dbuf.dsize != sizeof(td->records[0]))
+		return -1;
+	memcpy(&td->records[td->calls++], dbuf.dptr, dbuf.dsize);
+	return 0;
+}
+
+/* Since ntdb_nextkey frees dptr, we need to clone it (aborts on OOM). */
+static NTDB_DATA dup_key(NTDB_DATA key)
+{
+	void *p = malloc(key.dsize ? key.dsize : 1);
+	if (!p)
+		abort();
+	key.dptr = memcpy(p, key.dptr, key.dsize);
+	return key;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i, j;
+	int num;
+	struct trav_data td;
+	NTDB_DATA k;
+	struct ntdb_context *ntdb;
+	union ntdb_attribute seed_attr;
+	enum NTDB_ERROR ecode;
+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
+			NTDB_NOMMAP|NTDB_CONVERT };
+	/* Fixed hash seed, chained in front of the tap logging attribute. */
+	seed_attr.base.attr = NTDB_ATTRIBUTE_SEED;
+	seed_attr.base.next = &tap_log_attr;
+	seed_attr.seed.seed = 6334326220117065685ULL;
+
+	plan_tests(sizeof(flags) / sizeof(flags[0])
+		   * (NUM_RECORDS*6 + (NUM_RECORDS-1)*3 + 22) + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		ntdb = ntdb_open("api-firstkey-nextkey.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600,
+			       &seed_attr);
+		ok1(ntdb);
+		if (!ntdb)
+			continue;
+
+		ok1(ntdb_firstkey(ntdb, &k) == NTDB_ERR_NOEXIST);
+
+		/* One entry... */
+		k.dptr = (unsigned char *)&num;
+		k.dsize = sizeof(num);
+		num = 0;
+		ok1(ntdb_store(ntdb, k, k, NTDB_INSERT) == 0);
+		ok1(ntdb_firstkey(ntdb, &k) == NTDB_SUCCESS);
+		ok1(k.dsize == sizeof(num));
+		ok1(memcmp(k.dptr, &num, sizeof(num)) == 0);
+		ok1(ntdb_nextkey(ntdb, &k) == NTDB_ERR_NOEXIST);
+
+		/* Two entries. */
+		k.dptr = (unsigned char *)&num;
+		k.dsize = sizeof(num);
+		num = 1;
+		ok1(ntdb_store(ntdb, k, k, NTDB_INSERT) == 0);
+		ok1(ntdb_firstkey(ntdb, &k) == NTDB_SUCCESS);
+		ok1(k.dsize == sizeof(num));
+		memcpy(&num, k.dptr, sizeof(num));
+		ok1(num == 0 || num == 1);
+		ok1(ntdb_nextkey(ntdb, &k) == NTDB_SUCCESS);
+		ok1(k.dsize == sizeof(j));
+		memcpy(&j, k.dptr, sizeof(j));
+		ok1(j == 0 || j == 1);
+		ok1(j != num);
+		ok1(ntdb_nextkey(ntdb, &k) == NTDB_ERR_NOEXIST);
+
+		/* Clean up. */
+		k.dptr = (unsigned char *)&num;
+		k.dsize = sizeof(num);
+		num = 0;
+		ok1(ntdb_delete(ntdb, k) == 0);
+		num = 1;
+		ok1(ntdb_delete(ntdb, k) == 0);
+
+		/* Now lots of records. */
+		ok1(store_records(ntdb));
+		td.calls = 0;
+
+		num = ntdb_traverse(ntdb, trav, &td);
+		ok1(num == NUM_RECORDS);
+		ok1(td.calls == NUM_RECORDS);
+
+		/* Simple loop should match ntdb_traverse */
+		for (j = 0, ecode = ntdb_firstkey(ntdb, &k); j < td.calls; j++) {
+			int val;
+
+			ok1(ecode == NTDB_SUCCESS);
+			ok1(k.dsize == sizeof(val));
+			memcpy(&val, k.dptr, k.dsize);
+			ok1(td.records[j] == val);
+			ecode = ntdb_nextkey(ntdb, &k);
+		}
+
+		/* But arbitrary orderings should work too. */
+		for (j = td.calls-1; j > 0; j--) {
+			k.dptr = (unsigned char *)&td.records[j-1];
+			k.dsize = sizeof(td.records[j-1]);
+			k = dup_key(k);
+			ok1(ntdb_nextkey(ntdb, &k) == NTDB_SUCCESS);
+			ok1(k.dsize == sizeof(td.records[j]));
+			ok1(memcmp(k.dptr, &td.records[j], k.dsize) == 0);
+			free(k.dptr);
+		}
+
+		/* Even delete should work. */
+		for (j = 0, ecode = ntdb_firstkey(ntdb, &k);
+		     ecode != NTDB_ERR_NOEXIST;
+		     j++) {
+			ok1(ecode == NTDB_SUCCESS);
+			ok1(k.dsize == 4);
+			ok1(ntdb_delete(ntdb, k) == 0);
+			ecode = ntdb_nextkey(ntdb, &k);
+		}
+
+		diag("delete using first/nextkey gave %u of %u records",
+		     j, NUM_RECORDS);
+		ok1(j == NUM_RECORDS);
+		ntdb_close(ntdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/api-fork-test.c b/lib/ntdb/test/api-fork-test.c
new file mode 100644
index 0000000000..57bd686282
--- /dev/null
+++ b/lib/ntdb/test/api-fork-test.c
@@ -0,0 +1,179 @@
+/* Test forking while holding lock.
+ *
+ * There are only five ways to do this currently:
+ * (1) grab a ntdb_chainlock, then fork.
+ * (2) grab a ntdb_lockall, then fork.
+ * (3) grab a ntdb_lockall_read, then fork.
+ * (4) start a transaction, then fork.
+ * (5) fork from inside a ntdb_parse() callback.
+ *
+ * Note that we don't hold a lock across ntdb_traverse callbacks, so
+ * that doesn't matter.
+ */
+#include "config.h"
+#include "ntdb.h"
+#include "tap-interface.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include "logging.h"
+
+static enum NTDB_ERROR fork_in_parse(NTDB_DATA key, NTDB_DATA data,
+				    struct ntdb_context *ntdb)
+{
+	int child_status;
+
+	if (fork() == 0) {
+		/* Child: we expect both the store and the fetch to fail
+		 * with a lock error, which should leave exactly two log
+		 * messages behind. */
+		if (ntdb_store(ntdb, key, data, NTDB_REPLACE) != NTDB_ERR_LOCK
+		    || ntdb_fetch(ntdb, key, &data) != NTDB_ERR_LOCK)
+			exit(1);
+
+		if (tap_log_messages != 2)
+			exit(2);
+
+		/* Closing in the child must not log anything further. */
+		ntdb_close(ntdb);
+		exit(tap_log_messages == 2 ? 0 : 3);
+	}
+	/* Parent: the callback succeeds; the child's verdict is a TAP test. */
+	wait(&child_status);
+	ok1(WIFEXITED(child_status) && WEXITSTATUS(child_status) == 0);
+	return NTDB_SUCCESS;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct ntdb_context *ntdb;
+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
+	NTDB_DATA key = ntdb_mkdata("key", 3);
+	NTDB_DATA data = ntdb_mkdata("data", 4);
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 14);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		int status;
+
+		tap_log_messages = 0;
+
+		ntdb = ntdb_open("run-fork-test.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (!ok1(ntdb))
+			continue;
+
+		/* Put a record in here. */
+		ok1(ntdb_store(ntdb, key, data, NTDB_REPLACE) == NTDB_SUCCESS);
+
+		ok1(ntdb_chainlock(ntdb, key) == NTDB_SUCCESS);
+		if (fork() == 0) {
+			/* We expect this to fail. */
+			if (ntdb_store(ntdb, key, data, NTDB_REPLACE) != NTDB_ERR_LOCK)
+				return 1;
+
+			if (ntdb_fetch(ntdb, key, &data) != NTDB_ERR_LOCK)
+				return 1;
+
+			if (tap_log_messages != 2)
+				return 2;
+
+			ntdb_chainunlock(ntdb, key);
+			if (tap_log_messages != 3)
+				return 3;
+			ntdb_close(ntdb);
+			if (tap_log_messages != 3)
+				return 4;
+			return 0;
+		}
+		wait(&status);
+		ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0);
+		ntdb_chainunlock(ntdb, key);
+
+		ok1(ntdb_lockall(ntdb) == NTDB_SUCCESS);
+		if (fork() == 0) {
+			/* We expect this to fail. */
+			if (ntdb_store(ntdb, key, data, NTDB_REPLACE) != NTDB_ERR_LOCK)
+				return 1;
+
+			if (ntdb_fetch(ntdb, key, &data) != NTDB_ERR_LOCK)
+				return 1;
+
+			if (tap_log_messages != 2)
+				return 2;
+
+			ntdb_unlockall(ntdb);
+			if (tap_log_messages != 2)
+				return 3;
+			ntdb_close(ntdb);
+			if (tap_log_messages != 2)
+				return 4;
+			return 0;
+		}
+		wait(&status);
+		ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0);
+		ntdb_unlockall(ntdb);
+
+		ok1(ntdb_lockall_read(ntdb) == NTDB_SUCCESS);
+		if (fork() == 0) {
+			/* We expect this to fail. */
+			/* This would always fail anyway... */
+			if (ntdb_store(ntdb, key, data, NTDB_REPLACE) != NTDB_ERR_LOCK)
+				return 1;
+
+			if (ntdb_fetch(ntdb, key, &data) != NTDB_ERR_LOCK)
+				return 1;
+
+			if (tap_log_messages != 2)
+				return 2;
+
+			ntdb_unlockall_read(ntdb);
+			if (tap_log_messages != 2)
+				return 3;
+			ntdb_close(ntdb);
+			if (tap_log_messages != 2)
+				return 4;
+			return 0;
+		}
+		wait(&status);
+		ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0);
+		ntdb_unlockall_read(ntdb);
+
+		ok1(ntdb_transaction_start(ntdb) == NTDB_SUCCESS);
+		/* If the transaction is empty, a no-op "commit" succeeds. */
+		ok1(ntdb_delete(ntdb, key) == NTDB_SUCCESS);
+		if (fork() == 0) {
+			/* We expect this to fail. */
+			if (ntdb_store(ntdb, key, data, NTDB_REPLACE) != NTDB_ERR_LOCK)
+				return 1;
+
+			if (ntdb_fetch(ntdb, key, &data) != NTDB_ERR_LOCK)
+				return 1;
+
+			if (tap_log_messages != 2)
+				return 2;
+
+			if (ntdb_transaction_commit(ntdb) != NTDB_ERR_LOCK)
+				return 3;
+
+			ntdb_close(ntdb);
+			if (tap_log_messages < 3)
+				return 4;
+			return 0;
+		}
+		wait(&status);
+		ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0);
+		ntdb_transaction_cancel(ntdb);
+
+		ok1(ntdb_parse_record(ntdb, key, fork_in_parse, ntdb)
+		    == NTDB_SUCCESS);
+		ntdb_close(ntdb);
+		ok1(tap_log_messages == 0);
+	}
+	return exit_status();
+}
diff --git a/lib/ntdb/test/api-locktimeout.c b/lib/ntdb/test/api-locktimeout.c
new file mode 100644
index 0000000000..cafe067d0b
--- /dev/null
+++ b/lib/ntdb/test/api-locktimeout.c
@@ -0,0 +1,193 @@
+#include "config.h"
+#include "ntdb.h"
+#include "tap-interface.h"
+#include "system/wait.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <errno.h>
+#include "logging.h"
+#include "external-agent.h"
+
+#undef alarm
+#define alarm fast_alarm
+
+/* Millisecond alarm() so tests run fast: one-shot timer, 0 disarms it. */
+static unsigned int fast_alarm(unsigned int milli_seconds)
+{
+	struct itimerval it;
+
+	it.it_interval.tv_sec = it.it_interval.tv_usec = 0; /* No repeat. */
+	it.it_value.tv_sec = milli_seconds / 1000;
+	it.it_value.tv_usec = (milli_seconds % 1000) * 1000; /* Keep < 1 sec. */
+	setitimer(ITIMER_REAL, &it, NULL);
+	return 0;
+}
+
+#define CatchSignal(sig, handler) signal((sig), (handler))
+
+static void do_nothing(int signum)
+{
+}
+
+/* This example code is taken from SAMBA, so try not to change it. */
+static struct flock flock_struct;
+
+/* Return a value which is none of v1, v2 or v3. */
+static inline short int invalid_value(short int v1, short int v2, short int v3)
+{
+	short int try = (v1+v2+v3)^((v1+v2+v3) << 16);
+	while (try == v1 || try == v2 || try == v3)
+		try++;
+	return try;
+}
+
+/* We invalidate in as many ways as we can, so the OS rejects it */
+static void invalidate_flock_struct(int signum)
+{
+	flock_struct.l_type = invalid_value(F_RDLCK, F_WRLCK, F_UNLCK);
+	flock_struct.l_whence = invalid_value(SEEK_SET, SEEK_CUR, SEEK_END);
+	flock_struct.l_start = -1;
+	/* A large negative. */
+	flock_struct.l_len = (((off_t)1 << (sizeof(off_t)*CHAR_BIT - 1)) + 1);
+}
+
+static int timeout_lock(int fd, int rw, off_t off, off_t len, bool waitflag,
+			void *_timeout)
+{
+	int ret, saved_errno = errno;
+	unsigned int timeout = *(unsigned int *)_timeout;
+
+	flock_struct.l_type = rw;
+	flock_struct.l_whence = SEEK_SET;
+	flock_struct.l_start = off;
+	flock_struct.l_len = len;
+
+	CatchSignal(SIGALRM, invalidate_flock_struct);
+	alarm(timeout);
+
+	for (;;) {
+		if (waitflag)
+			ret = fcntl(fd, F_SETLKW, &flock_struct);
+		else
+			ret = fcntl(fd, F_SETLK, &flock_struct);
+
+		if (ret == 0)
+			break;
+
+		/* Not signalled?  Something else went wrong. */
+		if (flock_struct.l_len == len) {
+			if (errno == EAGAIN || errno == EINTR)
+				continue;
+			saved_errno = errno;
+			break;
+		} else {
+			saved_errno = EINTR;
+			break;
+		}
+	}
+
+	alarm(0);
+	errno = saved_errno;
+	return ret;
+}
+
+static int ntdb_chainlock_with_timeout_internal(struct ntdb_context *ntdb,
+					       NTDB_DATA key,
+					       unsigned int timeout,
+					       int rw_type)
+{
+	union ntdb_attribute locking;
+	enum NTDB_ERROR ecode;
+
+	if (timeout) {
+		locking.base.attr = NTDB_ATTRIBUTE_FLOCK;
+		ecode = ntdb_get_attribute(ntdb, &locking);
+		if (ecode != NTDB_SUCCESS)
+			return ecode;
+
+		/* Replace locking function with our own. */
+		locking.flock.data = &timeout;
+		locking.flock.lock = timeout_lock;
+
+		ecode = ntdb_set_attribute(ntdb, &locking);
+		if (ecode != NTDB_SUCCESS)
+			return ecode;
+	}
+	if (rw_type == F_RDLCK)
+		ecode = ntdb_chainlock_read(ntdb, key);
+	else
+		ecode = ntdb_chainlock(ntdb, key);
+
+	if (timeout) {
+		ntdb_unset_attribute(ntdb, NTDB_ATTRIBUTE_FLOCK);
+	}
+	return ecode;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct ntdb_context *ntdb;
+	NTDB_DATA key = ntdb_mkdata("hello", 5);
+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
+	struct agent *agent;
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 15);
+
+	agent = prepare_external_agent();
+
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		enum NTDB_ERROR ecode;
+		ntdb = ntdb_open("run-locktimeout.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (!ok1(ntdb))
+			break;
+
+		/* Simple cases: should succeed. */
+		ecode = ntdb_chainlock_with_timeout_internal(ntdb, key, 20,
+							    F_RDLCK);
+		ok1(ecode == NTDB_SUCCESS);
+		ok1(tap_log_messages == 0);
+
+		ntdb_chainunlock_read(ntdb, key);
+		ok1(tap_log_messages == 0);
+
+		ecode = ntdb_chainlock_with_timeout_internal(ntdb, key, 20,
+							    F_WRLCK);
+		ok1(ecode == NTDB_SUCCESS);
+		ok1(tap_log_messages == 0);
+
+		ntdb_chainunlock(ntdb, key);
+		ok1(tap_log_messages == 0);
+
+		/* OK, get agent to start transaction, then we should time out. */
+		ok1(external_agent_operation(agent, OPEN, "run-locktimeout.ntdb")
+		    == SUCCESS);
+		ok1(external_agent_operation(agent, TRANSACTION_START, "")
+		    == SUCCESS);
+		ecode = ntdb_chainlock_with_timeout_internal(ntdb, key, 20,
+							    F_WRLCK);
+		ok1(ecode == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+
+		/* Even if we get a different signal, should be fine. */
+		CatchSignal(SIGUSR1, do_nothing);
+		external_agent_operation(agent, SEND_SIGNAL, "");
+		ecode = ntdb_chainlock_with_timeout_internal(ntdb, key, 20,
+							    F_WRLCK);
+		ok1(ecode == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 0);
+
+		ok1(external_agent_operation(agent, TRANSACTION_COMMIT, "")
+		    == SUCCESS);
+		ok1(external_agent_operation(agent, CLOSE, "")
+		    == SUCCESS);
+		ntdb_close(ntdb);
+	}
+	free_external_agent(agent);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/api-missing-entries.c b/lib/ntdb/test/api-missing-entries.c
new file mode 100644
index 0000000000..1c8064f945
--- /dev/null
+++ b/lib/ntdb/test/api-missing-entries.c
@@ -0,0 +1,44 @@
+/* Another test revealed that we lost an entry.  This reproduces it. */
+#include "config.h"
+#include "ntdb.h"
+#include <ccan/hash/hash.h>
+#include "tap-interface.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "logging.h"
+
+#define NUM_RECORDS 1189
+
+/* We use the same seed which we saw this failure on. */
+static uint64_t failhash(const void *key, size_t len, uint64_t seed, void *p)
+{
+	seed = 699537674708983027ULL;
+	return hash64_stable((const unsigned char *)key, len, seed);
+}
+
+int main(int argc, char *argv[])
+{
+	int i;
+	struct ntdb_context *ntdb;
+	NTDB_DATA key = { (unsigned char *)&i, sizeof(i) };
+	NTDB_DATA data = { (unsigned char *)&i, sizeof(i) };
+	union ntdb_attribute hattr = { .hash = { .base = { NTDB_ATTRIBUTE_HASH },
+						.fn = failhash } };
+
+	hattr.base.next = &tap_log_attr;
+	plan_tests(1 + NUM_RECORDS + 2);
+
+	ntdb = ntdb_open("run-missing-entries.ntdb", NTDB_INTERNAL,
+		       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
+	if (ok1(ntdb)) {
+		for (i = 0; i < NUM_RECORDS; i++) {
+			ok1(ntdb_store(ntdb, key, data, NTDB_REPLACE) == 0);
+		}
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+		ntdb_close(ntdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/api-open-multiple-times.c b/lib/ntdb/test/api-open-multiple-times.c
new file mode 100644
index 0000000000..70bad00568
--- /dev/null
+++ b/lib/ntdb/test/api-open-multiple-times.c
@@ -0,0 +1,83 @@
+#include "config.h"
+#include "ntdb.h"
+#include "tap-interface.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct ntdb_context *ntdb, *ntdb2;
+	NTDB_DATA key = { (unsigned char *)&i, sizeof(i) };
+	NTDB_DATA data = { (unsigned char *)&i, sizeof(i) };
+	NTDB_DATA d = { NULL, 0 }; /* Initialized only to silence a bogus GCC warning */
+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 28);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		ntdb = ntdb_open("run-open-multiple-times.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(ntdb);
+		if (!ntdb)
+			continue;
+
+		ntdb2 = ntdb_open("run-open-multiple-times.ntdb", flags[i],
+				O_RDWR|O_CREAT, 0600, &tap_log_attr);
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+		ok1(ntdb_check(ntdb2, NULL, NULL) == 0);
+
+		/* Store via one handle, fetch via the other. */
+		ok1(ntdb_store(ntdb, key, data, NTDB_REPLACE) == 0);
+		ok1(ntdb_fetch(ntdb2, key, &d) == NTDB_SUCCESS);
+		ok1(ntdb_deq(d, data));
+		free(d.dptr);
+
+		/* Vice versa, with delete. */
+		ok1(ntdb_delete(ntdb2, key) == 0);
+		ok1(ntdb_fetch(ntdb, key, &d) == NTDB_ERR_NOEXIST);
+
+		/* OK, now close first one, check second still good. */
+		ok1(ntdb_close(ntdb) == 0);
+
+		ok1(ntdb_store(ntdb2, key, data, NTDB_REPLACE) == 0);
+		ok1(ntdb_fetch(ntdb2, key, &d) == NTDB_SUCCESS);
+		ok1(ntdb_deq(d, data));
+		free(d.dptr);
+
+		/* Reopen the first handle. */
+		ntdb = ntdb_open("run-open-multiple-times.ntdb", flags[i],
+			       O_RDWR|O_CREAT, 0600, &tap_log_attr);
+		ok1(ntdb);
+
+		ok1(ntdb_transaction_start(ntdb2) == 0);
+
+		/* Anything via the other handle should fail, logging each time. */
+		ok1(ntdb_fetch(ntdb, key, &d) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 1);
+		ok1(ntdb_store(ntdb, key, data, NTDB_REPLACE) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 2);
+		ok1(ntdb_transaction_start(ntdb) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 3);
+		ok1(ntdb_chainlock(ntdb, key) == NTDB_ERR_LOCK);
+		ok1(tap_log_messages == 4);
+
+		/* Transaction should work as normal. */
+		ok1(ntdb_store(ntdb2, key, data, NTDB_REPLACE) == NTDB_SUCCESS);
+
+		/* Now... try closing with locks held. */
+		ok1(ntdb_close(ntdb2) == 0);
+
+		ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
+		ok1(ntdb_deq(d, data));
+		free(d.dptr);
+		ok1(ntdb_close(ntdb) == 0);
+		ok1(tap_log_messages == 4);
+		tap_log_messages = 0;
+	}
+
+	return exit_status();
+}
diff --git a/lib/ntdb/test/api-record-expand.c b/lib/ntdb/test/api-record-expand.c
new file mode 100644
index 0000000000..cea5a10bfb
--- /dev/null
+++ b/lib/ntdb/test/api-record-expand.c
@@ -0,0 +1,51 @@
+#include "config.h"
+#include "ntdb.h"
+#include "tap-interface.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include "logging.h"
+
+#define MAX_SIZE 10000
+#define SIZE_STEP 131
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct ntdb_context *ntdb;
+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
+			NTDB_NOMMAP|NTDB_CONVERT };
+	NTDB_DATA key = ntdb_mkdata("key", 3);
+	NTDB_DATA data;
+
+	data.dptr = malloc(MAX_SIZE);
+	memset(data.dptr, 0x24, MAX_SIZE);
+
+	plan_tests(sizeof(flags) / sizeof(flags[0])
+		   * (3 + (1 + (MAX_SIZE/SIZE_STEP)) * 2) + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		ntdb = ntdb_open("run-record-expand.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(ntdb);
+		if (!ntdb)
+			continue;
+
+		data.dsize = 0;
+		ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+		for (data.dsize = 0;
+		     data.dsize < MAX_SIZE;
+		     data.dsize += SIZE_STEP) {
+			memset(data.dptr, data.dsize, data.dsize);
+			ok1(ntdb_store(ntdb, key, data, NTDB_MODIFY) == 0);
+			ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+		}
+		ntdb_close(ntdb);
+	}
+	ok1(tap_log_messages == 0);
+	free(data.dptr);
+
+	return exit_status();
+}
diff --git a/lib/ntdb/test/api-simple-delete.c b/lib/ntdb/test/api-simple-delete.c
new file mode 100644
index 0000000000..2b20e199ee
--- /dev/null
+++ b/lib/ntdb/test/api-simple-delete.c
@@ -0,0 +1,39 @@
+#include "config.h"
+#include "ntdb.h"
+#include "tap-interface.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct ntdb_context *ntdb;
+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
+			NTDB_NOMMAP|NTDB_CONVERT };
+	NTDB_DATA key = ntdb_mkdata("key", 3);
+	NTDB_DATA data = ntdb_mkdata("data", 4);
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 7 + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		ntdb = ntdb_open("run-simple-delete.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(ntdb);
+		if (ntdb) {
+			/* Delete should fail. */
+			ok1(ntdb_delete(ntdb, key) == NTDB_ERR_NOEXIST);
+			ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+			/* Insert should succeed. */
+			ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
+			ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+			/* Delete should now work. */
+			ok1(ntdb_delete(ntdb, key) == 0);
+			ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+			ntdb_close(ntdb);
+		}
+	}
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/api-summary.c b/lib/ntdb/test/api-summary.c
new file mode 100644
index 0000000000..8060ef29be
--- /dev/null
+++ b/lib/ntdb/test/api-summary.c
@@ -0,0 +1,58 @@
+#include "config.h"
+#include "ntdb.h"
+#include "tap-interface.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{
+	unsigned int i, j;
+	struct ntdb_context *ntdb;
+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
+			NTDB_NOMMAP|NTDB_CONVERT };
+	NTDB_DATA key = { (unsigned char *)&j, sizeof(j) };
+	NTDB_DATA data = { (unsigned char *)&j, sizeof(j) };
+	char *summary;
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * (1 + 2 * 5) + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		ntdb = ntdb_open("run-summary.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(ntdb);
+		if (!ntdb)
+			continue;
+
+		/* Put some stuff in there. */
+		for (j = 0; j < 500; j++) {
+			/* Make sure padding varies so we get some graphs! */
+			data.dsize = j % (sizeof(j) + 1);
+			if (ntdb_store(ntdb, key, data, NTDB_REPLACE) != 0)
+				fail("Storing in ntdb");
+		}
+
+		for (j = 0;
+		     j <= NTDB_SUMMARY_HISTOGRAMS;
+		     j += NTDB_SUMMARY_HISTOGRAMS) {
+			ok1(ntdb_summary(ntdb, j, &summary) == NTDB_SUCCESS);
+			ok1(strstr(summary, "Number of records: 500\n"));
+			ok1(strstr(summary, "Smallest/average/largest keys: 4/4/4\n"));
+			ok1(strstr(summary, "Smallest/average/largest data: 0/2/4\n"));
+			if (j == NTDB_SUMMARY_HISTOGRAMS) {
+				ok1(strstr(summary, "|")
+				    && strstr(summary, "*"));
+			} else {
+				ok1(!strstr(summary, "|")
+				    && !strstr(summary, "*"));
+			}
+			free(summary);
+		}
+		ntdb_close(ntdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/external-agent.c b/lib/ntdb/test/external-agent.c
new file mode 100644
index 0000000000..098d0cb595
--- /dev/null
+++ b/lib/ntdb/test/external-agent.c
@@ -0,0 +1,252 @@
+#include "external-agent.h"
+#include "logging.h"
+#include "lock-tracking.h"
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <ccan/err/err.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <string.h>
+#include <errno.h>
+#include "tap-interface.h"
+#include <stdio.h>
+#include <stdarg.h>
+
+static struct ntdb_context *ntdb;
+
+void (*external_agent_free)(void *) = free;
+
+static enum NTDB_ERROR clear_if_first(int fd, void *arg)
+{
+/* We always hold a lock at offset 4, so we can tell if anyone is holding it.
+ * (This is compatible with tdb's TDB_CLEAR_IF_FIRST flag).  */
+	struct flock fl;
+
+	fl.l_type = F_WRLCK;
+	fl.l_whence = SEEK_SET;
+	fl.l_start = 4;
+	fl.l_len = 1;
+
+	if (fcntl(fd, F_SETLK, &fl) == 0) {
+		/* We must be first ones to open it! */
+		diag("agent truncating file!");
+		if (ftruncate(fd, 0) != 0) {
+			return NTDB_ERR_IO;
+		}
+	}
+	fl.l_type = F_RDLCK; /* Now take a read lock on that byte, waiting if needed. */
+	if (fcntl(fd, F_SETLKW, &fl) != 0) {
+		return NTDB_ERR_IO;
+	}
+	return NTDB_SUCCESS;
+}
+
+static enum agent_return do_operation(enum operation op, const char *name)
+{
+	NTDB_DATA k;
+	enum agent_return ret;
+	NTDB_DATA data;
+	enum NTDB_ERROR ecode;
+	union ntdb_attribute cif;
+
+	if (op != OPEN && op != OPEN_WITH_HOOK && !ntdb) {
+		diag("external: No ntdb open!");
+		return OTHER_FAILURE;
+	}
+
+	diag("external: %s", operation_name(op));
+
+	k = ntdb_mkdata(name, strlen(name));
+
+	locking_would_block = 0;
+	switch (op) {
+	case OPEN:
+		if (ntdb) {
+			diag("Already have ntdb %s open", ntdb_name(ntdb));
+			return OTHER_FAILURE;
+		}
+		ntdb = ntdb_open(name, NTDB_DEFAULT, O_RDWR, 0, &tap_log_attr);
+		if (!ntdb) {
+			if (!locking_would_block)
+				diag("Opening ntdb gave %s", strerror(errno));
+			forget_locking();
+			ret = OTHER_FAILURE;
+		} else
+			ret = SUCCESS;
+		break;
+	case OPEN_WITH_HOOK:
+		if (ntdb) {
+			diag("Already have ntdb %s open", ntdb_name(ntdb));
+			return OTHER_FAILURE;
+		}
+		cif.openhook.base.attr = NTDB_ATTRIBUTE_OPENHOOK;
+		cif.openhook.base.next = &tap_log_attr;
+		cif.openhook.fn = clear_if_first;
+		ntdb = ntdb_open(name, NTDB_DEFAULT, O_RDWR, 0, &cif);
+		if (!ntdb) {
+			if (!locking_would_block)
+				diag("Opening ntdb gave %s", strerror(errno));
+			forget_locking();
+			ret = OTHER_FAILURE;
+		} else
+			ret = SUCCESS;
+		break;
+	case FETCH:
+		ecode = ntdb_fetch(ntdb, k, &data);
+		if (ecode == NTDB_ERR_NOEXIST) {
+			ret = FAILED;
+		} else if (ecode < 0) {
+			ret = OTHER_FAILURE;
+		} else if (!ntdb_deq(data, k)) {
+			ret = OTHER_FAILURE;
+			external_agent_free(data.dptr);
+		} else {
+			ret = SUCCESS;
+			external_agent_free(data.dptr);
+		}
+		break;
+	case STORE:
+		ret = ntdb_store(ntdb, k, k, 0) == 0 ? SUCCESS : OTHER_FAILURE;
+		break;
+	case TRANSACTION_START:
+		ret = ntdb_transaction_start(ntdb) == 0 ? SUCCESS : OTHER_FAILURE;
+		break;
+	case TRANSACTION_COMMIT:
+		ret = ntdb_transaction_commit(ntdb)==0 ? SUCCESS : OTHER_FAILURE;
+		break;
+	case NEEDS_RECOVERY:
+		ret = external_agent_needs_rec(ntdb);
+		break;
+	case CHECK:
+		ret = ntdb_check(ntdb, NULL, NULL) == 0 ? SUCCESS : OTHER_FAILURE;
+		break;
+	case CLOSE:
+		ret = ntdb_close(ntdb) == 0 ? SUCCESS : OTHER_FAILURE;
+		ntdb = NULL;
+		break;
+	case SEND_SIGNAL:
+		/* We do this async */
+		ret = SUCCESS;
+		break;
+	default:
+		ret = OTHER_FAILURE;
+	}
+
+	if (locking_would_block)
+		ret = WOULD_HAVE_BLOCKED;
+
+	return ret;
+}
+
+struct agent {
+	int cmdfd, responsefd;
+};
+
+/* Do this before doing any ntdb stuff.  Return handle, or NULL. */
+struct agent *prepare_external_agent(void)
+{
+	int pid, ret;
+	int command[2], response[2];
+	char name[1+PATH_MAX];
+
+	if (pipe(command) != 0 || pipe(response) != 0)
+		return NULL;
+
+	pid = fork();
+	if (pid < 0)
+		return NULL;
+
+	if (pid != 0) {
+		struct agent *agent = malloc(sizeof(*agent));
+
+		close(command[0]);
+		close(response[1]);
+		agent->cmdfd = command[1];
+		agent->responsefd = response[0];
+		return agent;
+	}
+
+	close(command[1]);
+	close(response[0]);
+
+	/* We want to fail, not block. */
+	nonblocking_locks = true;
+	log_prefix = "external: ";
+	while ((ret = read(command[0], name, sizeof(name))) > 0) {
+		enum agent_return result;
+
+		result = do_operation(name[0], name+1);
+		if (write(response[1], &result, sizeof(result))
+		    != sizeof(result))
+			err(1, "Writing response");
+		if (name[0] == SEND_SIGNAL) {
+			struct timeval ten_ms;
+			ten_ms.tv_sec = 0;
+			ten_ms.tv_usec = 10000;
+			select(0, NULL, NULL, NULL, &ten_ms);
+			kill(getppid(), SIGUSR1);
+		}
+	}
+	exit(0);
+}
+
+/* Ask the external agent to try to do an operation. */
+enum agent_return external_agent_operation(struct agent *agent,
+					   enum operation op,
+					   const char *name)
+{
+	enum agent_return res;
+	unsigned int len;
+	char *string;
+
+	if (!name)
+		name = "";
+	len = 1 + strlen(name) + 1;
+	string = malloc(len);
+
+	string[0] = op;
+	strcpy(string+1, name);
+
+	if (write(agent->cmdfd, string, len) != len
+	    || read(agent->responsefd, &res, sizeof(res)) != sizeof(res))
+		res = AGENT_DIED;
+
+	free(string);
+	return res;
+}
+
+const char *agent_return_name(enum agent_return ret)
+{
+	return ret == SUCCESS ? "SUCCESS"
+		: ret == WOULD_HAVE_BLOCKED ? "WOULD_HAVE_BLOCKED"
+		: ret == AGENT_DIED ? "AGENT_DIED"
+		: ret == FAILED ? "FAILED"
+		: ret == OTHER_FAILURE ? "OTHER_FAILURE"
+		: "**INVALID**";
+}
+
+const char *operation_name(enum operation op)
+{
+	switch (op) {
+	case OPEN: return "OPEN";
+	case OPEN_WITH_HOOK: return "OPEN_WITH_HOOK";
+	case FETCH: return "FETCH";
+	case STORE: return "STORE";
+	case CHECK: return "CHECK";
+	case TRANSACTION_START: return "TRANSACTION_START";
+	case TRANSACTION_COMMIT: return "TRANSACTION_COMMIT";
+	case NEEDS_RECOVERY: return "NEEDS_RECOVERY";
+	case SEND_SIGNAL: return "SEND_SIGNAL";
+	case CLOSE: return "CLOSE";
+	}
+	return "**INVALID**";
+}
+
+void free_external_agent(struct agent *agent)
+{
+	close(agent->cmdfd);
+	close(agent->responsefd);
+	free(agent);
+}
diff --git a/lib/ntdb/test/external-agent.h b/lib/ntdb/test/external-agent.h
new file mode 100644
index 0000000000..c6b83d5b49
--- /dev/null
+++ b/lib/ntdb/test/external-agent.h
@@ -0,0 +1,51 @@
+#ifndef NTDB_TEST_EXTERNAL_AGENT_H
+#define NTDB_TEST_EXTERNAL_AGENT_H
+
+/* For locking tests, we need a different process to try things at
+ * various times. */
+enum operation {
+	OPEN,
+	OPEN_WITH_HOOK,
+	FETCH,
+	STORE,
+	TRANSACTION_START,
+	TRANSACTION_COMMIT,
+	NEEDS_RECOVERY,
+	CHECK,
+	SEND_SIGNAL,
+	CLOSE,
+};
+
+/* Do this before doing any ntdb stuff.  Return handle, or NULL. */
+struct agent *prepare_external_agent(void);
+
+enum agent_return {
+	SUCCESS,
+	WOULD_HAVE_BLOCKED,
+	AGENT_DIED,
+	FAILED, /* For fetch, or NEEDS_RECOVERY */
+	OTHER_FAILURE,
+};
+
+/* Ask the external agent to try to do an operation.
+ * name == ntdb name for OPEN/OPEN_WITH_HOOK,
+ * record name for FETCH/STORE (store stores name as data too)
+ */
+enum agent_return external_agent_operation(struct agent *handle,
+					   enum operation op,
+					   const char *name);
+
+/* Hook into free() on ntdb_data in external agent. */
+extern void (*external_agent_free)(void *);
+
+/* Mapping enum -> string. */
+const char *agent_return_name(enum agent_return ret);
+const char *operation_name(enum operation op);
+
+void free_external_agent(struct agent *agent);
+
+/* Internal use: */
+struct ntdb_context;
+enum agent_return external_agent_needs_rec(struct ntdb_context *ntdb);
+
+#endif /* NTDB_TEST_EXTERNAL_AGENT_H */
diff --git a/lib/ntdb/test/failtest_helper.c b/lib/ntdb/test/failtest_helper.c
new file mode 100644
index 0000000000..cc110919c3
--- /dev/null
+++ b/lib/ntdb/test/failtest_helper.c
@@ -0,0 +1,96 @@
+#include "failtest_helper.h"
+#include "logging.h"
+#include <string.h>
+#include "tap-interface.h"
+
+bool failtest_suppress = false;
+
+/* FIXME: From ccan/str */
+static inline bool strends(const char *str, const char *postfix)
+{
+	if (strlen(str) < strlen(postfix))
+		return false;
+
+	return !strcmp(str + strlen(str) - strlen(postfix), postfix);
+}
+
+bool failmatch(const struct failtest_call *call,
+	       const char *file, int line, enum failtest_call_type type)
+{
+	return call->type == type
+		&& call->line == line
+		&& ((strcmp(call->file, file) == 0)
+		    || (strends(call->file, file)
+			&& (call->file[strlen(call->file) - strlen(file) - 1]
+			    == '/')));
+}
+
+static bool is_nonblocking_lock(const struct failtest_call *call)
+{
+	return call->type == FAILTEST_FCNTL && call->u.fcntl.cmd == F_SETLK;
+}
+
+static bool is_unlock(const struct failtest_call *call)
+{
+	return call->type == FAILTEST_FCNTL
+		&& call->u.fcntl.arg.fl.l_type == F_UNLCK;
+}
+
+bool exit_check_log(struct tlist_calls *history)
+{
+	const struct failtest_call *i;
+
+	tlist_for_each(history, i, list) {
+		if (!i->fail)
+			continue;
+		/* Failing the /dev/urandom open doesn't count: we fall back. */
+		if (failmatch(i, URANDOM_OPEN))
+			continue;
+
+		/* Similarly with read fail. */
+		if (failmatch(i, URANDOM_READ))
+			continue;
+
+		/* Initial allocation of ntdb doesn't log. */
+		if (failmatch(i, INITIAL_NTDB_MALLOC))
+			continue;
+
+		/* We don't block "failures" on non-blocking locks. */
+		if (is_nonblocking_lock(i))
+			continue;
+
+		if (!tap_log_messages)
+			diag("We didn't log for %s:%u", i->file, i->line);
+		return tap_log_messages != 0;
+	}
+	return true;
+}
+
+/* Some places we soldier on despite errors: only fail them once. */
+enum failtest_result
+block_repeat_failures(struct tlist_calls *history)
+{
+	const struct failtest_call *last;
+
+	last = tlist_tail(history, list);
+
+	if (failtest_suppress)
+		return FAIL_DONT_FAIL;
+
+	if (failmatch(last, INITIAL_NTDB_MALLOC)
+	    || failmatch(last, URANDOM_OPEN)
+	    || failmatch(last, URANDOM_READ)) {
+		return FAIL_PROBE;
+	}
+
+	/* We handle mmap failing, by falling back to read/write, so
+	 * don't try all possible paths. */
+	if (last->type == FAILTEST_MMAP)
+		return FAIL_PROBE;
+
+	/* Unlock or non-blocking lock is fail-once. */
+	if (is_unlock(last) || is_nonblocking_lock(last))
+		return FAIL_PROBE;
+
+	return FAIL_OK;
+}
diff --git a/lib/ntdb/test/failtest_helper.h b/lib/ntdb/test/failtest_helper.h
new file mode 100644
index 0000000000..e754636402
--- /dev/null
+++ b/lib/ntdb/test/failtest_helper.h
@@ -0,0 +1,19 @@
+#ifndef NTDB_TEST_FAILTEST_HELPER_H
+#define NTDB_TEST_FAILTEST_HELPER_H
+#include <ccan/failtest/failtest.h>
+#include <stdbool.h>
+
+/* FIXME: Check these! */
+#define INITIAL_NTDB_MALLOC	"open.c", 403, FAILTEST_MALLOC
+#define URANDOM_OPEN		"open.c", 62, FAILTEST_OPEN
+#define URANDOM_READ		"open.c", 42, FAILTEST_READ
+
+bool exit_check_log(struct tlist_calls *history);
+bool failmatch(const struct failtest_call *call,
+	       const char *file, int line, enum failtest_call_type type);
+enum failtest_result block_repeat_failures(struct tlist_calls *history);
+
+/* Set this to suppress failure. */
+extern bool failtest_suppress;
+
+#endif /* NTDB_TEST_FAILTEST_HELPER_H */
diff --git a/lib/ntdb/test/helpapi-external-agent.c b/lib/ntdb/test/helpapi-external-agent.c
new file mode 100644
index 0000000000..eb81399072
--- /dev/null
+++ b/lib/ntdb/test/helpapi-external-agent.c
@@ -0,0 +1,7 @@
+#include "external-agent.h"
+
+/* This isn't possible via the ntdb API, but this stub makes it link. */
+enum agent_return external_agent_needs_rec(struct ntdb_context *ntdb)
+{
+	return FAILED;
+}
diff --git a/lib/ntdb/test/helprun-external-agent.c b/lib/ntdb/test/helprun-external-agent.c
new file mode 100644
index 0000000000..81a3fe881d
--- /dev/null
+++ b/lib/ntdb/test/helprun-external-agent.c
@@ -0,0 +1,7 @@
+#include "external-agent.h"
+#include "private.h"
+
+enum agent_return external_agent_needs_rec(struct ntdb_context *ntdb)
+{
+	return ntdb_needs_recovery(ntdb) ? SUCCESS : FAILED;
+}
diff --git a/lib/ntdb/test/helprun-layout.c b/lib/ntdb/test/helprun-layout.c
new file mode 100644
index 0000000000..c8f1fd03c4
--- /dev/null
+++ b/lib/ntdb/test/helprun-layout.c
@@ -0,0 +1,402 @@
+/* NTDB tools to create various canned database layouts. */
+#include "layout.h"
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <ccan/err/err.h>
+#include "logging.h"
+
+struct ntdb_layout *new_ntdb_layout(void)
+{
+	struct ntdb_layout *layout = calloc(1, sizeof(*layout)); /* zeroed: no elems, NULL array */
+	if (!layout)
+		err(1, "allocating ntdb_layout");
+	return layout;
+}
+
+static void add(struct ntdb_layout *layout, union ntdb_layout_elem elem)
+{
+	size_t bytes = sizeof(layout->elem[0]) * (layout->num_elems + 1);
+	if (!(layout->elem = realloc(layout->elem, bytes)))	/* grow by one slot */
+		err(1, "growing layout to %u elements", layout->num_elems + 1);
+	layout->elem[layout->num_elems++] = elem;
+}
+
+void ntdb_layout_add_freetable(struct ntdb_layout *layout)
+{
+	/* A free table carries no parameters beyond its type tag. */
+	union ntdb_layout_elem elem = { .base = { .type = FREETABLE } };
+	add(layout, elem);
+}
+
+void ntdb_layout_add_free(struct ntdb_layout *layout, ntdb_len_t len,
+			 unsigned ftable)
+{
+	/* A free record of length len (header excluded) in free table #ftable. */
+	union ntdb_layout_elem elem = { .free = { .base = { .type = FREE },
+						  .len = len,
+						  .ftable_num = ftable } };
+	add(layout, elem);
+}
+
+void ntdb_layout_add_capability(struct ntdb_layout *layout,
+			       uint64_t type,
+			       bool write_breaks,
+			       bool check_breaks,
+			       bool open_breaks,
+			       ntdb_len_t extra)
+{
+	/* Fold the three "breaks" flags into the capability type word. */
+	union ntdb_layout_elem elem;
+	uint64_t captype = type;
+
+	captype |= write_breaks ? NTDB_CAP_NOWRITE : 0;
+	captype |= open_breaks ? NTDB_CAP_NOOPEN : 0;
+	captype |= check_breaks ? NTDB_CAP_NOCHECK : 0;
+	elem.base.type = CAPABILITY;
+	elem.capability.type = captype;
+	elem.capability.extra = extra;
+	add(layout, elem);
+}
+
+static NTDB_DATA dup_key(NTDB_DATA key)
+{
+	NTDB_DATA ret = { .dptr = malloc(key.dsize), .dsize = key.dsize };
+	if (!ret.dptr && ret.dsize) /* malloc(0) may legitimately return NULL */
+		err(1, "duplicating %lu bytes", (unsigned long)ret.dsize);
+	memcpy(ret.dptr, key.dptr, ret.dsize);
+	return ret;
+}
+
+void ntdb_layout_add_used(struct ntdb_layout *layout,
+			 NTDB_DATA key, NTDB_DATA data,
+			 ntdb_len_t extra)
+{
+	/* Key and data are deep-copied; ntdb_layout_free() releases them. */
+	union ntdb_layout_elem elem = { .used = { .base = { .type = DATA },
+						  .key = dup_key(key),
+						  .data = dup_key(data),
+						  .extra = extra } };
+	add(layout, elem);
+}
+
+static ntdb_len_t free_record_len(ntdb_len_t len)
+{
+	return sizeof(struct ntdb_used_record) + len; /* used-record header size plus len */
+}
+
+static ntdb_len_t data_record_len(struct tle_used *used)
+{
+	/* Header + key + data + padding; must not be smaller than a free record. */
+	ntdb_len_t total = sizeof(struct ntdb_used_record) + used->extra;
+	total += used->key.dsize + used->data.dsize;
+	assert(total >= sizeof(struct ntdb_free_record));
+	return total;
+}
+
+static ntdb_len_t hashtable_len(struct tle_hashtable *htable)
+{
+	/* Header, one offset slot per sub-level bucket, then the extra bytes. */
+	ntdb_len_t slots = sizeof(ntdb_off_t) << NTDB_SUBLEVEL_HASH_BITS;
+	return sizeof(struct ntdb_used_record) + slots + htable->extra;
+}
+
+static ntdb_len_t capability_len(struct tle_capability *cap)
+{
+	return sizeof(struct ntdb_capability) + cap->extra; /* whole struct (hdr included) + extra bytes */
+}
+
+static ntdb_len_t freetable_len(struct tle_freetable *ftable)
+{
+	return sizeof(struct ntdb_freetable); /* fixed size; ftable itself is not consulted */
+}
+
+static void set_free_record(void *mem, ntdb_len_t len)
+{
+	/* Deliberately empty: we do all the work in add_to_freetable */
+}
+
+static void add_zero_pad(struct ntdb_used_record *u, size_t len, size_t extra)
+{
+	if (extra) /* padding present: NUL the byte right after the len payload bytes */
+		((char *)(u + 1))[len] = '\0';
+}
+
+static void set_data_record(void *mem, struct ntdb_context *ntdb,
+			    struct tle_used *used)
+{
+	struct ntdb_used_record *rec = mem;
+	char *payload = (char *)(rec + 1);	/* key, then data, follow the header */
+	size_t klen = used->key.dsize, dlen = used->data.dsize;
+	set_header(ntdb, rec, NTDB_USED_MAGIC, klen, dlen,
+		   klen + dlen + used->extra,
+		   ntdb_hash(ntdb, used->key.dptr, klen));
+	memcpy(payload, used->key.dptr, klen);
+	memcpy(payload + klen, used->data.dptr, dlen);
+	add_zero_pad(rec, klen + dlen, used->extra);
+}
+
+static void set_hashtable(void *mem, struct ntdb_context *ntdb,
+			  struct tle_hashtable *htable)
+{
+	struct ntdb_used_record *rec = mem;
+	const ntdb_len_t tbl = sizeof(ntdb_off_t) << NTDB_SUBLEVEL_HASH_BITS;
+	/* Empty table: header, zeroed buckets, optional pad byte. */
+	set_header(ntdb, rec, NTDB_HTABLE_MAGIC, 0, tbl, tbl + htable->extra, 0);
+	memset(rec + 1, 0, tbl);
+	add_zero_pad(rec, tbl, htable->extra);
+}
+
+static void set_capability(void *mem, struct ntdb_context *ntdb,
+			   struct tle_capability *cap, struct ntdb_header *hdr,
+			   ntdb_off_t last_cap)
+{
+	struct ntdb_capability *self = mem, *prev;
+	ntdb_len_t body = sizeof(*self) - sizeof(self->hdr) + cap->extra;
+
+	self->type = cap->type;
+	self->next = 0;
+	set_header(ntdb, &self->hdr, NTDB_CAP_MAGIC, 0, body, body, 0);
+
+	/* Chain it: from the previous capability if any, else from the header. */
+	if (last_cap) {
+		prev = (struct ntdb_capability *)((char *)hdr + last_cap);
+		prev->next = cap->base.off;
+	} else {
+		hdr->capabilities = cap->base.off;
+	}
+}
+
+static void set_freetable(void *mem, struct ntdb_context *ntdb,
+			 struct tle_freetable *freetable, struct ntdb_header *hdr,
+			 ntdb_off_t last_ftable)
+{
+	struct ntdb_freetable *self = mem, *prev;
+	const ntdb_len_t body = sizeof(*self) - sizeof(self->hdr);
+	memset(self, 0, sizeof(*self));
+	set_header(ntdb, &self->hdr, NTDB_FTABLE_MAGIC, 0, body, body, 0);
+
+	/* Link from the previous table if any, else from the file header. */
+	if (!last_ftable) {
+		hdr->free_table = freetable->base.off;
+	} else {
+		prev = (struct ntdb_freetable *)((char *)hdr + last_ftable);
+		prev->next = freetable->base.off;
+	}
+}
+
+static void add_to_freetable(struct ntdb_context *ntdb,
+			     ntdb_off_t eoff,
+			     ntdb_off_t elen,
+			     unsigned ftable,
+			     struct tle_freetable *freetable)
+{
+	ntdb->ftable_off = freetable->base.off;	/* point ntdb at the chosen table first */
+	ntdb->ftable = ftable;
+	add_free_record(ntdb, eoff, sizeof(struct ntdb_used_record) + elen,
+			NTDB_LOCK_WAIT, false);
+}
+
+static ntdb_off_t hbucket_off(ntdb_off_t group_start, unsigned ingroup)
+{
+	return group_start /* slot offset: ingroup wraps modulo the group size */
+		+ (ingroup % (1 << NTDB_HASH_GROUP_BITS)) * sizeof(ntdb_off_t);
+}
+
+/* Get num bits of val starting at bit start; 64-bit mask so num == 32 is safe. */
+static uint32_t bits(uint64_t val, unsigned start, unsigned num)
+{
+	assert(num <= 32);
+	return (val >> start) & (((uint64_t)1 << num) - 1);
+}
+
+/* Consume the next num bits of h, counting *used down from bit 63.  We take
+ * bits from the top: that way we can lock whole sections of the hash by range. */
+static uint32_t use_bits(uint64_t h, unsigned num, unsigned *used)
+{
+	*used += num;
+	return bits(h, 64 - *used, num);
+}
+
+static ntdb_off_t encode_offset(ntdb_off_t new_off, unsigned bucket,
+			       uint64_t h)
+{
+	return bucket /* OR of: bucket, record offset, and the top hash bits (below) */
+		| new_off
+		| ((uint64_t)bits(h, 64 - NTDB_OFF_UPPER_STEAL_EXTRA,
+				  NTDB_OFF_UPPER_STEAL_EXTRA)
+		   << NTDB_OFF_HASH_EXTRA_BIT);
+}
+
+/* FIXME: Our hash table handling here is primitive: we don't expand! */
+static void add_to_hashtable(struct ntdb_context *ntdb,
+			     ntdb_off_t eoff,
+			     NTDB_DATA key)
+{
+	const unsigned group_size = 1 << NTDB_HASH_GROUP_BITS;
+	uint64_t hash = ntdb_hash(ntdb, key.dptr, key.dsize);
+	unsigned taken = 0, probe, grp, home;
+	ntdb_off_t base, slot;
+
+	/* Top bits select the group, the next few the home bucket within it. */
+	grp = use_bits(hash, NTDB_TOPLEVEL_HASH_BITS-NTDB_HASH_GROUP_BITS, &taken);
+	home = use_bits(hash, NTDB_HASH_GROUP_BITS, &taken);
+	base = offsetof(struct ntdb_header, hashtable)
+		+ grp * (sizeof(ntdb_off_t) << NTDB_HASH_GROUP_BITS);
+
+	/* Linear probe from the home bucket, wrapping round inside the group. */
+	for (probe = 0; probe < group_size; probe++) {
+		slot = hbucket_off(base, (home + probe) % group_size);
+		if (ntdb_read_off(ntdb, slot) != 0)
+			continue;
+		/* The stored entry records the home bucket, not the slot used. */
+		ntdb_write_off(ntdb, slot, encode_offset(eoff, home, hash));
+		return;
+	}
+	/* Group is full and we cannot expand: give up. */
+	abort();
+}
+
+static struct tle_freetable *find_ftable(struct ntdb_layout *layout, unsigned num)
+{
+	unsigned i, seen = 0;
+
+	/* Return the num'th (0-based) FREETABLE element; abort if too few. */
+	for (i = 0; i < layout->num_elems; i++) {
+		union ntdb_layout_elem *e = &layout->elem[i];
+
+		if (e->base.type == FREETABLE && seen++ == num)
+			return &e->ftable;
+	}
+	abort();
+}
+
+/* Build an in-memory ntdb from layout.  FIXME: Support NTDB_CONVERT */
+struct ntdb_context *ntdb_layout_get(struct ntdb_layout *layout,
+				   void (*freefn)(void *),
+				   union ntdb_attribute *attr)
+{
+	unsigned int i;
+	ntdb_off_t off, len, last_ftable, last_cap;
+	char *mem;
+	struct ntdb_context *ntdb;
+
+	off = sizeof(struct ntdb_header);
+
+	/* First pass of layout: assign each element's offset, total the size */
+	for (i = 0; i < layout->num_elems; i++) {
+		union ntdb_layout_elem *e = &layout->elem[i];
+		e->base.off = off;
+		switch (e->base.type) {
+		case FREETABLE:
+			len = freetable_len(&e->ftable);
+			break;
+		case FREE:
+			len = free_record_len(e->free.len);
+			break;
+		case DATA:
+			len = data_record_len(&e->used);
+			break;
+		case HASHTABLE:
+			len = hashtable_len(&e->hashtable);
+			break;
+		case CAPABILITY:
+			len = capability_len(&e->capability);
+			break;
+		default:
+			abort();
+		}
+		off += len;
+	}
+
+	mem = malloc(off); /* NOTE(review): result is not checked for NULL */
+	/* Fill with some weird pattern. */
+	memset(mem, 0x99, off);
+	/* Now populate our header, cribbing from a real NTDB header. */
+	ntdb = ntdb_open(NULL, NTDB_INTERNAL, O_RDWR, 0, attr); /* NOTE(review): unchecked too */
+	memcpy(mem, ntdb->file->map_ptr, sizeof(struct ntdb_header));
+
+	/* Mug the ntdb we have to make it use this (freefn drops its old map). */
+	freefn(ntdb->file->map_ptr);
+	ntdb->file->map_ptr = mem;
+	ntdb->file->map_size = off;
+
+	last_ftable = 0; /* offset of the previous free table / capability, 0 = none yet */
+	last_cap = 0;
+	for (i = 0; i < layout->num_elems; i++) {
+		union ntdb_layout_elem *e = &layout->elem[i];
+		switch (e->base.type) {
+		case FREETABLE:
+			set_freetable(mem + e->base.off, ntdb, &e->ftable,
+				     (struct ntdb_header *)mem, last_ftable);
+			last_ftable = e->base.off;
+			break;
+		case FREE:
+			set_free_record(mem + e->base.off, e->free.len);
+			break;
+		case DATA:
+			set_data_record(mem + e->base.off, ntdb, &e->used);
+			break;
+		case HASHTABLE:
+			set_hashtable(mem + e->base.off, ntdb, &e->hashtable);
+			break;
+		case CAPABILITY:
+			set_capability(mem + e->base.off, ntdb, &e->capability,
+				       (struct ntdb_header *)mem, last_cap);
+			last_cap = e->base.off;
+			break;
+		}
+	}
+	/* Must have a free table! */
+	assert(last_ftable);
+
+	/* Now fill the free and hash tables (needs every record in place). */
+	for (i = 0; i < layout->num_elems; i++) {
+		union ntdb_layout_elem *e = &layout->elem[i];
+		switch (e->base.type) {
+		case FREE:
+			add_to_freetable(ntdb, e->base.off, e->free.len,
+					 e->free.ftable_num,
+					 find_ftable(layout, e->free.ftable_num));
+			break;
+		case DATA:
+			add_to_hashtable(ntdb, e->base.off, e->used.key);
+			break;
+		default:
+			break;
+		}
+	}
+
+	ntdb->ftable_off = find_ftable(layout, 0)->base.off; /* leave the first table current */
+	return ntdb;
+}
+
+void ntdb_layout_write(struct ntdb_layout *layout, void (*freefn)(void *),
+		       union ntdb_attribute *attr, const char *filename)
+{
+	/* Build the image in memory, then dump it verbatim to filename. */
+	struct ntdb_context *ntdb = ntdb_layout_get(layout, freefn, attr);
+	int fd = open(filename, O_WRONLY|O_TRUNC|O_CREAT,  0600);
+
+	if (fd < 0)
+		err(1, "opening %s for writing", filename);
+	if (write(fd, ntdb->file->map_ptr, ntdb->file->map_size)
+	    != ntdb->file->map_size)
+		err(1, "writing %s", filename);
+	close(fd);
+	ntdb_close(ntdb);
+}
+
+void ntdb_layout_free(struct ntdb_layout *layout)
+{
+	unsigned int n;	/* walk backwards: the free order is irrelevant */
+
+	for (n = layout->num_elems; n-- > 0; ) {
+		if (layout->elem[n].base.type != DATA)
+			continue;	/* only DATA owns key/data copies */
+		free(layout->elem[n].used.key.dptr);
+		free(layout->elem[n].used.data.dptr);
+	}
+	free(layout->elem);
+	free(layout);
+}
diff --git a/lib/ntdb/test/layout.h b/lib/ntdb/test/layout.h
new file mode 100644
index 0000000000..bcd20b8965
--- /dev/null
+++ b/lib/ntdb/test/layout.h
@@ -0,0 +1,87 @@
+#ifndef NTDB_TEST_LAYOUT_H
+#define NTDB_TEST_LAYOUT_H
+#include "private.h"
+
+struct ntdb_layout *new_ntdb_layout(void);
+void ntdb_layout_add_freetable(struct ntdb_layout *layout);
+void ntdb_layout_add_free(struct ntdb_layout *layout, ntdb_len_t len,
+			 unsigned ftable);
+void ntdb_layout_add_used(struct ntdb_layout *layout,
+			 NTDB_DATA key, NTDB_DATA data,
+			 ntdb_len_t extra);
+void ntdb_layout_add_capability(struct ntdb_layout *layout,
+			       uint64_t type,
+			       bool write_breaks,
+			       bool check_breaks,
+			       bool open_breaks,
+			       ntdb_len_t extra);
+
+#if 0 /* FIXME: Allow allocation of subtables */
+void ntdb_layout_add_hashtable(struct ntdb_layout *layout,
+			      int htable_parent, /* -1 == toplevel */
+			      unsigned int bucket,
+			      ntdb_len_t extra);
+#endif
+/* freefn is needed if we're using failtest_free. */
+struct ntdb_context *ntdb_layout_get(struct ntdb_layout *layout,
+				   void (*freefn)(void *),
+				   union ntdb_attribute *attr);
+void ntdb_layout_write(struct ntdb_layout *layout, void (*freefn)(void *),
+		       union ntdb_attribute *attr, const char *filename);
+
+void ntdb_layout_free(struct ntdb_layout *layout);
+
+enum layout_type {
+	FREETABLE, FREE, DATA, HASHTABLE, CAPABILITY /* tag stored in tle_base.type */
+};
+
+/* Shared by all union members. */
+struct tle_base {
+	enum layout_type type;
+	ntdb_off_t off;	/* assigned by ntdb_layout_get()'s first pass */
+};
+
+struct tle_freetable {
+	struct tle_base base;
+};
+
+struct tle_free {
+	struct tle_base base;
+	ntdb_len_t len;	/* length as given to ntdb_layout_add_free() */
+	unsigned ftable_num;	/* which FREETABLE element (0-based) holds it */
+};
+
+struct tle_used {
+	struct tle_base base;
+	NTDB_DATA key;	/* heap copy; released by ntdb_layout_free() */
+	NTDB_DATA data;	/* heap copy; released by ntdb_layout_free() */
+	ntdb_len_t extra;	/* bytes reserved after key and data */
+};
+
+struct tle_hashtable {
+	struct tle_base base;
+	int parent;
+	unsigned int bucket;
+	ntdb_len_t extra;
+};
+
+struct tle_capability {
+	struct tle_base base;
+	uint64_t type;	/* capability type, including any NTDB_CAP_NO* flags */
+	ntdb_len_t extra;
+};
+
+union ntdb_layout_elem {
+	struct tle_base base;	/* every member starts with a tle_base */
+	struct tle_freetable ftable;
+	struct tle_free free;
+	struct tle_used used;
+	struct tle_hashtable hashtable;
+	struct tle_capability capability;
+};
+
+struct ntdb_layout {
+	unsigned int num_elems;	/* number of entries in elem[] */
+	union ntdb_layout_elem *elem;	/* heap array, in file order */
+};
+#endif /* NTDB_TEST_LAYOUT_H */
diff --git a/lib/ntdb/test/lock-tracking.c b/lib/ntdb/test/lock-tracking.c
new file mode 100644
index 0000000000..525a5c4ca7
--- /dev/null
+++ b/lib/ntdb/test/lock-tracking.c
@@ -0,0 +1,147 @@
+/* We save the locks so we can reacquire them. */
+#include "private.h" /* For NTDB_HASH_LOCK_START, etc. */
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include "tap-interface.h"
+#include "lock-tracking.h"
+
+struct lock {
+	struct lock *next;	/* singly linked; the list head is `locks' */
+	unsigned int off;	/* l_start of the recorded lock */
+	unsigned int len;	/* l_len; 0 is treated as "up to the maximum offset" */
+	int type;	/* l_type the lock was taken with */
+};
+static struct lock *locks;
+int locking_errors = 0;
+bool suppress_lockcheck = false;
+bool nonblocking_locks;
+int locking_would_block = 0;
+void (*unlock_callback)(int fd);
+
+int fcntl_with_lockcheck(int fd, int cmd, ... /* arg */ )
+{
+	va_list ap;
+	int ret, arg3;
+	struct flock *fl;
+	bool may_block = false;
+
+	if (cmd != F_SETLK && cmd != F_SETLKW) {
+		/* This may be totally bogus, but we don't know in general. */
+		va_start(ap, cmd);
+		arg3 = va_arg(ap, int);
+		va_end(ap);
+
+		return fcntl(fd, cmd, arg3);
+	}
+
+	va_start(ap, cmd);
+	fl = va_arg(ap, struct flock *);
+	va_end(ap);
+
+	if (cmd == F_SETLKW && nonblocking_locks) {
+		cmd = F_SETLK;
+		may_block = true;
+	}
+	ret = fcntl(fd, cmd, fl);
+
+	/* Detect when we failed, but might have been OK if we waited. */
+	if (may_block && ret == -1 && (errno == EAGAIN || errno == EACCES)) {
+		locking_would_block++;
+	}
+
+	if (fl->l_type == F_UNLCK) {
+		struct lock **l;
+		struct lock *old = NULL;
+
+		/* Find the exact lock being dropped; forget it if fcntl worked. */
+		for (l = &locks; *l; l = &(*l)->next) {
+			if ((*l)->off == fl->l_start
+			    && (*l)->len == fl->l_len) {
+				if (ret == 0) {
+					old = *l;
+					*l = (*l)->next;
+					free(old);
+				}
+				break;
+			}
+		}
+		if (!old && !suppress_lockcheck) {
+			diag("Unknown unlock %u@%u - %i",
+			     (unsigned int)fl->l_len, (unsigned int)fl->l_start, ret);
+			locking_errors++;
+		}
+	} else {
+		struct lock *new, *i;
+		unsigned int fl_end = fl->l_start + fl->l_len;
+		if (fl->l_len == 0)
+			fl_end = (unsigned int)-1;
+
+		/* Check for overlaps: we shouldn't do this. */
+		for (i = locks; i; i = i->next) {
+			unsigned int i_end = i->off + i->len;
+			if (i->len == 0)
+				i_end = (unsigned int)-1;
+
+			if (fl->l_start >= i->off && fl->l_start < i_end)
+				break;
+			if (fl_end > i->off && fl_end < i_end)
+				break;
+
+			/* ntdb_allrecord_lock does this, handle adjacent: */
+			if (fl->l_start > NTDB_HASH_LOCK_START
+			    && fl->l_start == i_end && fl->l_type == i->type) {
+				if (ret == 0) {
+					i->len = fl->l_len
+						? i->len + fl->l_len
+						: 0;
+				}
+				goto done;
+			}
+		}
+		if (i) {
+			/* Special case: upgrade of allrecord lock. */
+			if (i->type == F_RDLCK && fl->l_type == F_WRLCK
+			    && i->off == NTDB_HASH_LOCK_START
+			    && fl->l_start == NTDB_HASH_LOCK_START
+			    && i->len == 0
+			    && fl->l_len == 0) {
+				if (ret == 0)
+					i->type = F_WRLCK;
+				goto done;
+			}
+			if (!suppress_lockcheck) {
+				diag("%s lock %u@%u overlaps %u@%u",
+				     fl->l_type == F_WRLCK ? "write" : "read",
+				     (unsigned int)fl->l_len, (unsigned int)fl->l_start,
+				     i->len, i->off);
+				locking_errors++;
+			}
+		}
+
+		if (ret == 0) {
+			new = malloc(sizeof *new); /* NOTE(review): unchecked allocation */
+			new->off = fl->l_start;
+			new->len = fl->l_len;
+			new->type = fl->l_type;
+			new->next = locks;
+			locks = new;
+		}
+	}
+done:
+	if (ret == 0 && fl->l_type == F_UNLCK && unlock_callback)
+		unlock_callback(fd);
+	return ret;
+}
+
+unsigned int forget_locking(void)
+{
+	unsigned int dropped = 0;
+	struct lock *cur, *next;	/* free every node, counting as we go */
+	for (cur = locks; cur; cur = next, dropped++) {
+		next = cur->next;
+		free(cur);
+	}
+	locks = NULL;
+	return dropped;
+}
diff --git a/lib/ntdb/test/lock-tracking.h b/lib/ntdb/test/lock-tracking.h
new file mode 100644
index 0000000000..f2c9c44653
--- /dev/null
+++ b/lib/ntdb/test/lock-tracking.h
@@ -0,0 +1,25 @@
+#ifndef LOCK_TRACKING_H
+#define LOCK_TRACKING_H
+#include <stdbool.h>
+
+/* Set this if you want a callback after fcntl unlock. */
+extern void (*unlock_callback)(int fd);
+
+/* Replacement fcntl. */
+int fcntl_with_lockcheck(int fd, int cmd, ... /* arg */ );
+
+/* Discard locking info: returns number of locks outstanding. */
+unsigned int forget_locking(void);
+
+/* Number of errors in locking. */
+extern int locking_errors;
+
+/* Suppress lock checking. */
+extern bool suppress_lockcheck;
+
+/* Make all locks non-blocking. */
+extern bool nonblocking_locks;
+
+/* Number of times we failed a lock because we made it non-blocking. */
+extern int locking_would_block;
+#endif /* LOCK_TRACKING_H */
diff --git a/lib/ntdb/test/logging.c b/lib/ntdb/test/logging.c
new file mode 100644
index 0000000000..2819dd7cad
--- /dev/null
+++ b/lib/ntdb/test/logging.c
@@ -0,0 +1,30 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "tap-interface.h"
+#include "logging.h"
+
+unsigned tap_log_messages;
+const char *log_prefix = "";
+char *log_last = NULL;
+bool suppress_logging;
+
+union ntdb_attribute tap_log_attr = {
+	.log = { .base = { .attr = NTDB_ATTRIBUTE_LOG },
+		 .fn = tap_log_fn }	/* log attribute whose callback is tap_log_fn */
+};
+
+void tap_log_fn(struct ntdb_context *ntdb,
+		enum ntdb_log_level level,
+		enum NTDB_ERROR ecode,
+		const char *message, void *priv)
+{
+	if (!suppress_logging) {
+		/* Echo to the TAP diagnostics, with the test's prefix. */
+		diag("ntdb log level %u: %s: %s%s",
+		     level, ntdb_errorstr(ecode), log_prefix, message);
+		/* Remember only the newest message; free(NULL) is a no-op. */
+		free(log_last);
+		log_last = strdup(message);
+		tap_log_messages++;
+	}
+}
diff --git a/lib/ntdb/test/logging.h b/lib/ntdb/test/logging.h
new file mode 100644
index 0000000000..0336ccaba3
--- /dev/null
+++ b/lib/ntdb/test/logging.h
@@ -0,0 +1,17 @@
+#ifndef NTDB_TEST_LOGGING_H
+#define NTDB_TEST_LOGGING_H
+#include "ntdb.h"
+#include <stdbool.h>
+#include <string.h>
+
+extern bool suppress_logging;
+extern const char *log_prefix;
+extern unsigned tap_log_messages;
+extern union ntdb_attribute tap_log_attr;
+extern char *log_last;
+
+void tap_log_fn(struct ntdb_context *ntdb,
+		enum ntdb_log_level level,
+		enum NTDB_ERROR ecode,
+		const char *message, void *priv);
+#endif /* NTDB_TEST_LOGGING_H */
diff --git a/lib/ntdb/test/ntdb-source.h b/lib/ntdb/test/ntdb-source.h
new file mode 100644
index 0000000000..52268440d2
--- /dev/null
+++ b/lib/ntdb/test/ntdb-source.h
@@ -0,0 +1,11 @@
+#include "config.h"
+#include "check.c"
+#include "free.c"
+#include "hash.c"
+#include "io.c"
+#include "lock.c"
+#include "open.c"
+#include "summary.c"
+#include "ntdb.c"
+#include "transaction.c"
+#include "traverse.c"
diff --git a/lib/ntdb/test/run-001-encode.c b/lib/ntdb/test/run-001-encode.c
new file mode 100644
index 0000000000..12965676a2
--- /dev/null
+++ b/lib/ntdb/test/run-001-encode.c
@@ -0,0 +1,41 @@
+#include "ntdb-source.h"
+#include "tap-interface.h"
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct ntdb_used_record rec;
+	struct ntdb_context ntdb = { .log_fn = tap_log_fn };
+
+	plan_tests(64 + 32 + 48*6 + 1); /* one term per loop below, plus the log check */
+
+	/* We should be able to encode any data value. */
+	for (i = 0; i < 64; i++)
+		ok1(set_header(&ntdb, &rec, NTDB_USED_MAGIC, 0, 1ULL << i,
+			       1ULL << i, 0) == 0);
+
+	/* And any key and data with < 64 bits between them. */
+	for (i = 0; i < 32; i++) {
+		/* NOTE(review): 1ULL >> (63 - i) is 0 for every i here - confirm intent. */
+		ntdb_len_t dlen = 1ULL >> (63 - i), klen = 1ULL << i;
+		ok1(set_header(&ntdb, &rec, NTDB_USED_MAGIC, klen, dlen,
+			       klen + dlen, 0)  == 0);
+	}
+
+	/* We should neatly encode all values. */
+	for (i = 0; i < 48; i++) {
+		uint64_t h = 1ULL << (i < 5 ? i : 4);
+		uint64_t klen = 1ULL << (i < 16 ? i : 15);
+		uint64_t dlen = 1ULL << i;
+		uint64_t xlen = 1ULL << (i < 32 ? i : 31);
+		ok1(set_header(&ntdb, &rec, NTDB_USED_MAGIC, klen, dlen,
+			       klen+dlen+xlen, h) == 0);
+		ok1(rec_key_length(&rec) == klen);
+		ok1(rec_data_length(&rec) == dlen);
+		ok1(rec_extra_padding(&rec) == xlen);
+		ok1((uint64_t)rec_hash(&rec) == h);
+		ok1(rec_magic(&rec) == NTDB_USED_MAGIC);
+	}
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/run-001-fls.c b/lib/ntdb/test/run-001-fls.c
new file mode 100644
index 0000000000..ec61294c6f
--- /dev/null
+++ b/lib/ntdb/test/run-001-fls.c
@@ -0,0 +1,33 @@
+#include "ntdb-source.h"
+#include "tap-interface.h"
+
+static unsigned int dumb_fls(uint64_t num)
+{
+	unsigned int pos = 0;	/* 1-based index of the highest set bit; 0 if none */
+
+	while (num) {
+		num >>= 1;
+		pos++;
+	}
+	return pos;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i, j;
+
+	plan_tests(64 * 64 + 2); /* every (i, j) bit pair, plus the two zero checks */
+
+	ok1(fls64(0) == 0);
+	ok1(dumb_fls(0) == 0);
+
+	for (i = 0; i < 64; i++) {
+		for (j = 0; j < 64; j++) {
+			uint64_t val = (1ULL << i) | (1ULL << j);
+			ok(fls64(val) == dumb_fls(val),
+			   "%llu -> %u should be %u", (unsigned long long)val,
+			   fls64(val), dumb_fls(val));
+		}
+	}
+	return exit_status();
+}
diff --git a/lib/ntdb/test/run-01-new_database.c b/lib/ntdb/test/run-01-new_database.c
new file mode 100644
index 0000000000..ae70e86e07
--- /dev/null
+++ b/lib/ntdb/test/run-01-new_database.c
@@ -0,0 +1,34 @@
+#include <ccan/failtest/failtest_override.h>
+#include "ntdb-source.h"
+#include "tap-interface.h"
+#include <ccan/failtest/failtest.h>
+#include "logging.h"
+#include "failtest_helper.h"
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct ntdb_context *ntdb;
+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
+			NTDB_NOMMAP|NTDB_CONVERT };
+
+	failtest_init(argc, argv);
+	failtest_hook = block_repeat_failures;
+	failtest_exit_check = exit_check_log;
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 3); /* three ok1() per flag set */
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		ntdb = ntdb_open("run-new_database.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (!ok1(ntdb))
+			failtest_exit(exit_status());
+
+		failtest_suppress = true; /* no injected failures while checking */
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+		failtest_suppress = false;
+		ntdb_close(ntdb);
+		if (!ok1(tap_log_messages == 0))
+			break;
+	}
+	failtest_exit(exit_status());
+}
diff --git a/lib/ntdb/test/run-02-expand.c b/lib/ntdb/test/run-02-expand.c
new file mode 100644
index 0000000000..abf1569388
--- /dev/null
+++ b/lib/ntdb/test/run-02-expand.c
@@ -0,0 +1,62 @@
+#include <ccan/failtest/failtest_override.h>
+#include "ntdb-source.h"
+#include "tap-interface.h"
+#include <ccan/failtest/failtest.h>
+#include "logging.h"
+#include "failtest_helper.h"
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	uint64_t val;
+	struct ntdb_context *ntdb;
+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
+			NTDB_NOMMAP|NTDB_CONVERT };
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 11 + 1); /* 11 per flag set + log check */
+
+	failtest_init(argc, argv);
+	failtest_hook = block_repeat_failures;
+	failtest_exit_check = exit_check_log;
+
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		failtest_suppress = true;
+		ntdb = ntdb_open("run-expand.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (!ok1(ntdb))
+			break;
+
+		val = ntdb->file->map_size;
+		/* Need some hash lock for expand. */
+		ok1(ntdb_lock_hashes(ntdb, 0, 1, F_WRLCK, NTDB_LOCK_WAIT) == 0);
+		failtest_suppress = false; /* failures are injected only around ntdb_expand() */
+		if (!ok1(ntdb_expand(ntdb, 1) == 0)) {
+			failtest_suppress = true;
+			ntdb_close(ntdb);
+			break;
+		}
+		failtest_suppress = true;
+
+		ok1(ntdb->file->map_size >= val + 1 * NTDB_EXTENSION_FACTOR);
+		ok1(ntdb_unlock_hashes(ntdb, 0, 1, F_WRLCK) == 0);
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+		val = ntdb->file->map_size;
+		ok1(ntdb_lock_hashes(ntdb, 0, 1, F_WRLCK, NTDB_LOCK_WAIT) == 0);
+		failtest_suppress = false;
+		if (!ok1(ntdb_expand(ntdb, 1024) == 0)) {
+			failtest_suppress = true;
+			ntdb_close(ntdb);
+			break;
+		}
+		failtest_suppress = true;
+		ok1(ntdb_unlock_hashes(ntdb, 0, 1, F_WRLCK) == 0);
+		ok1(ntdb->file->map_size >= val + 1024 * NTDB_EXTENSION_FACTOR);
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+		ntdb_close(ntdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	failtest_exit(exit_status());
+}
diff --git a/lib/ntdb/test/run-03-coalesce.c b/lib/ntdb/test/run-03-coalesce.c
new file mode 100644
index 0000000000..f93b33a1c3
--- /dev/null
+++ b/lib/ntdb/test/run-03-coalesce.c
@@ -0,0 +1,178 @@
+#include "ntdb-source.h"
+#include "tap-interface.h"
+#include "logging.h"
+#include "layout.h"
+
+static ntdb_len_t free_record_length(struct ntdb_context *ntdb, ntdb_off_t off)
+{
+	struct ntdb_free_record frec;
+	enum NTDB_ERROR rc = ntdb_read_convert(ntdb, off, &frec, sizeof(frec));
+
+	/* Errors come back as the (converted) NTDB_ERROR code, not a length. */
+	if (rc != NTDB_SUCCESS)
+		return rc;
+	if (frec_magic(&frec) == NTDB_FREE_MAGIC)
+		return frec_len(&frec);
+	return NTDB_ERR_CORRUPT;
+}
+
+int main(int argc, char *argv[])
+{
+	ntdb_off_t b_off, test;
+	struct ntdb_context *ntdb;
+	struct ntdb_layout *layout;
+	NTDB_DATA data, key;
+	ntdb_len_t len;
+
+	/* FIXME: Test NTDB_CONVERT */
+	/* FIXME: Test lock order fail. */
+
+	plan_tests(42); /* 7 + 7 + 9 + 9 + 9 checks in the five scenarios, plus the log check */
+	data = ntdb_mkdata("world", 5);
+	key = ntdb_mkdata("hello", 5);
+
+	/* No coalescing can be done due to EOF */
+	layout = new_ntdb_layout();
+	ntdb_layout_add_freetable(layout);
+	len = 1024;
+	ntdb_layout_add_free(layout, len, 0);
+	ntdb_layout_write(layout, free, &tap_log_attr, "run-03-coalesce.ntdb");
+	/* NOMMAP is for lockcheck. */
+	ntdb = ntdb_open("run-03-coalesce.ntdb", NTDB_NOMMAP, O_RDWR, 0,
+		       &tap_log_attr);
+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+	ok1(free_record_length(ntdb, layout->elem[1].base.off) == len);
+
+	/* Figure out which bucket free entry is. */
+	b_off = bucket_off(ntdb->ftable_off, size_to_bucket(len));
+	/* Lock and fail to coalesce. */
+	ok1(ntdb_lock_free_bucket(ntdb, b_off, NTDB_LOCK_WAIT) == 0);
+	test = layout->elem[1].base.off;
+	ok1(coalesce(ntdb, layout->elem[1].base.off, b_off, len, &test)
+	    == 0);
+	ntdb_unlock_free_bucket(ntdb, b_off);
+	ok1(free_record_length(ntdb, layout->elem[1].base.off) == len);
+	ok1(test == layout->elem[1].base.off);
+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+	ntdb_close(ntdb);
+	ntdb_layout_free(layout);
+
+	/* No coalescing can be done due to used record */
+	layout = new_ntdb_layout();
+	ntdb_layout_add_freetable(layout);
+	ntdb_layout_add_free(layout, 1024, 0);
+	ntdb_layout_add_used(layout, key, data, 6);
+	ntdb_layout_write(layout, free, &tap_log_attr, "run-03-coalesce.ntdb");
+	/* NOMMAP is for lockcheck. */
+	ntdb = ntdb_open("run-03-coalesce.ntdb", NTDB_NOMMAP, O_RDWR, 0,
+		       &tap_log_attr);
+	ok1(free_record_length(ntdb, layout->elem[1].base.off) == 1024);
+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+	/* Figure out which bucket free entry is. */
+	b_off = bucket_off(ntdb->ftable_off, size_to_bucket(1024));
+	/* Lock and fail to coalesce. */
+	ok1(ntdb_lock_free_bucket(ntdb, b_off, NTDB_LOCK_WAIT) == 0);
+	test = layout->elem[1].base.off;
+	ok1(coalesce(ntdb, layout->elem[1].base.off, b_off, 1024, &test)
+	    == 0);
+	ntdb_unlock_free_bucket(ntdb, b_off);
+	ok1(free_record_length(ntdb, layout->elem[1].base.off) == 1024);
+	ok1(test == layout->elem[1].base.off);
+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+	ntdb_close(ntdb);
+	ntdb_layout_free(layout);
+
+	/* Coalescing can be done due to two free records, then EOF */
+	layout = new_ntdb_layout();
+	ntdb_layout_add_freetable(layout);
+	ntdb_layout_add_free(layout, 1024, 0);
+	ntdb_layout_add_free(layout, 2048, 0);
+	ntdb_layout_write(layout, free, &tap_log_attr, "run-03-coalesce.ntdb");
+	/* NOMMAP is for lockcheck. */
+	ntdb = ntdb_open("run-03-coalesce.ntdb", NTDB_NOMMAP, O_RDWR, 0,
+		       &tap_log_attr);
+	ok1(free_record_length(ntdb, layout->elem[1].base.off) == 1024);
+	ok1(free_record_length(ntdb, layout->elem[2].base.off) == 2048);
+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+	/* Figure out which bucket (first) free entry is. */
+	b_off = bucket_off(ntdb->ftable_off, size_to_bucket(1024));
+	/* Lock and coalesce. */
+	ok1(ntdb_lock_free_bucket(ntdb, b_off, NTDB_LOCK_WAIT) == 0);
+	test = layout->elem[2].base.off;
+	ok1(coalesce(ntdb, layout->elem[1].base.off, b_off, 1024, &test)
+	    == 1024 + sizeof(struct ntdb_used_record) + 2048);
+	/* Should tell us it's erased this one... */
+	ok1(test == NTDB_ERR_NOEXIST);
+	ok1(ntdb->file->allrecord_lock.count == 0 && ntdb->file->num_lockrecs == 0);
+	ok1(free_record_length(ntdb, layout->elem[1].base.off)
+	    == 1024 + sizeof(struct ntdb_used_record) + 2048);
+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+	ntdb_close(ntdb);
+	ntdb_layout_free(layout);
+
+	/* Coalescing can be done due to two free records, then data */
+	layout = new_ntdb_layout();
+	ntdb_layout_add_freetable(layout);
+	ntdb_layout_add_free(layout, 1024, 0);
+	ntdb_layout_add_free(layout, 512, 0);
+	ntdb_layout_add_used(layout, key, data, 6);
+	ntdb_layout_write(layout, free, &tap_log_attr, "run-03-coalesce.ntdb");
+	/* NOMMAP is for lockcheck. */
+	ntdb = ntdb_open("run-03-coalesce.ntdb", NTDB_NOMMAP, O_RDWR, 0,
+		       &tap_log_attr);
+	ok1(free_record_length(ntdb, layout->elem[1].base.off) == 1024);
+	ok1(free_record_length(ntdb, layout->elem[2].base.off) == 512);
+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+	/* Figure out which bucket free entry is. */
+	b_off = bucket_off(ntdb->ftable_off, size_to_bucket(1024));
+	/* Lock and coalesce. */
+	ok1(ntdb_lock_free_bucket(ntdb, b_off, NTDB_LOCK_WAIT) == 0);
+	test = layout->elem[2].base.off;
+	ok1(coalesce(ntdb, layout->elem[1].base.off, b_off, 1024, &test)
+	    == 1024 + sizeof(struct ntdb_used_record) + 512);
+	ok1(ntdb->file->allrecord_lock.count == 0 && ntdb->file->num_lockrecs == 0);
+	ok1(free_record_length(ntdb, layout->elem[1].base.off)
+	    == 1024 + sizeof(struct ntdb_used_record) + 512);
+	ok1(test == NTDB_ERR_NOEXIST);
+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+	ntdb_close(ntdb);
+	ntdb_layout_free(layout);
+
+	/* Coalescing can be done due to three free records, then EOF */
+	layout = new_ntdb_layout();
+	ntdb_layout_add_freetable(layout);
+	ntdb_layout_add_free(layout, 1024, 0);
+	ntdb_layout_add_free(layout, 512, 0);
+	ntdb_layout_add_free(layout, 256, 0);
+	ntdb_layout_write(layout, free, &tap_log_attr, "run-03-coalesce.ntdb");
+	/* NOMMAP is for lockcheck. */
+	ntdb = ntdb_open("run-03-coalesce.ntdb", NTDB_NOMMAP, O_RDWR, 0,
+		       &tap_log_attr);
+	ok1(free_record_length(ntdb, layout->elem[1].base.off) == 1024);
+	ok1(free_record_length(ntdb, layout->elem[2].base.off) == 512);
+	ok1(free_record_length(ntdb, layout->elem[3].base.off) == 256);
+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+	/* Figure out which bucket free entry is. */
+	b_off = bucket_off(ntdb->ftable_off, size_to_bucket(1024));
+	/* Lock and coalesce. */
+	ok1(ntdb_lock_free_bucket(ntdb, b_off, NTDB_LOCK_WAIT) == 0);
+	test = layout->elem[2].base.off;
+	ok1(coalesce(ntdb, layout->elem[1].base.off, b_off, 1024, &test)
+	    == 1024 + sizeof(struct ntdb_used_record) + 512
+	    + sizeof(struct ntdb_used_record) + 256);
+	ok1(ntdb->file->allrecord_lock.count == 0
+	    && ntdb->file->num_lockrecs == 0);
+	ok1(free_record_length(ntdb, layout->elem[1].base.off)
+	    == 1024 + sizeof(struct ntdb_used_record) + 512
+	    + sizeof(struct ntdb_used_record) + 256);
+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+	ntdb_close(ntdb);
+	ntdb_layout_free(layout);
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/run-04-basichash.c b/lib/ntdb/test/run-04-basichash.c
new file mode 100644
index 0000000000..6e3bdc012d
--- /dev/null
+++ b/lib/ntdb/test/run-04-basichash.c
@@ -0,0 +1,260 @@
+#include "ntdb-source.h"
+#include "tap-interface.h"
+#include "logging.h"
+
+/* Rigged hash: keys 2n and 2n+1 agree in the top NTDB_TOPLEVEL_HASH_BITS bits. */
+static uint64_t clash(const void *key, size_t len, uint64_t seed, void *priv)
+{
+	const unsigned int *num = key;	/* keys are unsigned ints; len, seed, priv unused */
+	return (uint64_t)*num << (63 - NTDB_TOPLEVEL_HASH_BITS);
+}
+
+int main(int argc, char *argv[])
+{	/* Drive the hash internals by hand, once per open-flag combination. */
+	unsigned int i, j;
+	struct ntdb_context *ntdb;
+	unsigned int v;
+	struct ntdb_used_record rec;
+	NTDB_DATA key = { (unsigned char *)&v, sizeof(v) };
+	NTDB_DATA dbuf = { (unsigned char *)&v, sizeof(v) };
+	union ntdb_attribute hattr = { .hash = { .base = { NTDB_ATTRIBUTE_HASH },
+						.fn = clash } };
+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
+			NTDB_NOMMAP|NTDB_CONVERT,
+	};
+
+	hattr.base.next = &tap_log_attr;
+
+	plan_tests(sizeof(flags) / sizeof(flags[0])
+		   * (91 + (2 * ((1 << NTDB_HASH_GROUP_BITS) - 1))) + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		struct hash_info h;
+		ntdb_off_t new_off, off, subhash;
+
+		ntdb = ntdb_open("run-04-basichash.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
+		ok1(ntdb);
+		if (!ntdb)
+			continue;
+
+		v = 0;
+		/* Should not find it: the database was just created empty. */
+		ok1(find_and_lock(ntdb, key, F_WRLCK, &h, &rec, NULL) == 0);
+		/* Should have created correct hash. */
+		ok1(h.h == ntdb_hash(ntdb, key.dptr, key.dsize));
+		/* Should have located space in group 0, bucket 0. */
+		ok1(h.group_start == offsetof(struct ntdb_header, hashtable));
+		ok1(h.home_bucket == 0);
+		ok1(h.found_bucket == 0);
+		ok1(h.hash_used == NTDB_TOPLEVEL_HASH_BITS);
+
+		/* Should have lock on bucket 0 */
+		ok1(h.hlock_start == 0);
+		ok1(h.hlock_range ==
+		    1ULL << (64-(NTDB_TOPLEVEL_HASH_BITS-NTDB_HASH_GROUP_BITS)));
+		ok1((ntdb->flags & NTDB_NOLOCK) || ntdb->file->num_lockrecs == 1);
+		ok1((ntdb->flags & NTDB_NOLOCK)
+		    || ntdb->file->lockrecs[0].off == NTDB_HASH_LOCK_START);
+		/* FIXME: Check lock length */
+
+		/* Allocate a new record. */
+		new_off = alloc(ntdb, key.dsize, dbuf.dsize, h.h,
+				NTDB_USED_MAGIC, false);
+		ok1(!NTDB_OFF_IS_ERR(new_off));
+
+		/* We should be able to add it now. */
+		ok1(add_to_hash(ntdb, &h, new_off) == 0);
+
+		/* Make sure we fill it in for later finding: key first, then data. */
+		off = new_off + sizeof(struct ntdb_used_record);
+		ok1(!ntdb->io->twrite(ntdb, off, key.dptr, key.dsize));
+		off += key.dsize;
+		ok1(!ntdb->io->twrite(ntdb, off, dbuf.dptr, dbuf.dsize));
+
+		/* We should be able to unlock that OK. */
+		ok1(ntdb_unlock_hashes(ntdb, h.hlock_start, h.hlock_range,
+				      F_WRLCK) == 0);
+
+		/* Database should be consistent. */
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+		/* Now, this should give a successful lookup. */
+		ok1(find_and_lock(ntdb, key, F_WRLCK, &h, &rec, NULL)
+		    == new_off);
+		/* Should have created correct hash. */
+		ok1(h.h == ntdb_hash(ntdb, key.dptr, key.dsize));
+		/* Should have located the record in group 0, bucket 0. */
+		ok1(h.group_start == offsetof(struct ntdb_header, hashtable));
+		ok1(h.home_bucket == 0);
+		ok1(h.found_bucket == 0);
+		ok1(h.hash_used == NTDB_TOPLEVEL_HASH_BITS);
+
+		/* Should have lock on bucket 0 */
+		ok1(h.hlock_start == 0);
+		ok1(h.hlock_range ==
+		    1ULL << (64-(NTDB_TOPLEVEL_HASH_BITS-NTDB_HASH_GROUP_BITS)));
+		ok1((ntdb->flags & NTDB_NOLOCK) || ntdb->file->num_lockrecs == 1);
+		ok1((ntdb->flags & NTDB_NOLOCK)
+		    || ntdb->file->lockrecs[0].off == NTDB_HASH_LOCK_START);
+		/* FIXME: Check lock length */
+
+		ok1(ntdb_unlock_hashes(ntdb, h.hlock_start, h.hlock_range,
+				      F_WRLCK) == 0);
+
+		/* Database should be consistent. */
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+		/* Test expansion: key 1 clashes with key 0 under the rigged hash. */
+		v = 1;
+		ok1(find_and_lock(ntdb, key, F_WRLCK, &h, &rec, NULL) == 0);
+		/* Should have created correct hash. */
+		ok1(h.h == ntdb_hash(ntdb, key.dptr, key.dsize));
+		/* Should have located space in group 0, bucket 1. */
+		ok1(h.group_start == offsetof(struct ntdb_header, hashtable));
+		ok1(h.home_bucket == 0);
+		ok1(h.found_bucket == 1);
+		ok1(h.hash_used == NTDB_TOPLEVEL_HASH_BITS);
+
+		/* Should have lock on bucket 0 */
+		ok1(h.hlock_start == 0);
+		ok1(h.hlock_range ==
+		    1ULL << (64-(NTDB_TOPLEVEL_HASH_BITS-NTDB_HASH_GROUP_BITS)));
+		ok1((ntdb->flags & NTDB_NOLOCK) || ntdb->file->num_lockrecs == 1);
+		ok1((ntdb->flags & NTDB_NOLOCK)
+		    || ntdb->file->lockrecs[0].off == NTDB_HASH_LOCK_START);
+		/* FIXME: Check lock length */
+
+		/* Make it expand 0'th bucket. */
+		ok1(expand_group(ntdb, &h) == 0);
+		/* First entry should be subhash, the rest should be empty. */
+		ok1(is_subhash(h.group[0]));
+		subhash = (h.group[0] & NTDB_OFF_MASK);
+		for (j = 1; j < (1 << NTDB_HASH_GROUP_BITS); j++)
+			ok1(h.group[j] == 0);
+
+		ok1(ntdb_write_convert(ntdb, h.group_start,
+				      h.group, sizeof(h.group)) == 0);
+		ok1(ntdb_unlock_hashes(ntdb, h.hlock_start, h.hlock_range,
+				      F_WRLCK) == 0);
+
+		/* Should be happy with expansion. */
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+		/* Should be able to find it. */
+		v = 0;
+		ok1(find_and_lock(ntdb, key, F_WRLCK, &h, &rec, NULL)
+		    == new_off);
+		/* Should have created correct hash. */
+		ok1(h.h == ntdb_hash(ntdb, key.dptr, key.dsize));
+		/* Should have located space in expanded group 0, bucket 0. */
+		ok1(h.group_start == subhash + sizeof(struct ntdb_used_record));
+		ok1(h.home_bucket == 0);
+		ok1(h.found_bucket == 0);
+		ok1(h.hash_used == NTDB_TOPLEVEL_HASH_BITS
+		    + NTDB_SUBLEVEL_HASH_BITS);
+
+		/* Should have lock on bucket 0 */
+		ok1(h.hlock_start == 0);
+		ok1(h.hlock_range ==
+		    1ULL << (64-(NTDB_TOPLEVEL_HASH_BITS-NTDB_HASH_GROUP_BITS)));
+		ok1((ntdb->flags & NTDB_NOLOCK) || ntdb->file->num_lockrecs == 1);
+		ok1((ntdb->flags & NTDB_NOLOCK)
+		    || ntdb->file->lockrecs[0].off == NTDB_HASH_LOCK_START);
+		/* FIXME: Check lock length */
+
+		/* Simple delete should work; the record's space goes back to the free list. */
+		ok1(delete_from_hash(ntdb, &h) == 0);
+		ok1(add_free_record(ntdb, new_off,
+				    sizeof(struct ntdb_used_record)
+				    + rec_key_length(&rec)
+				    + rec_data_length(&rec)
+				    + rec_extra_padding(&rec),
+				    NTDB_LOCK_NOWAIT, false) == 0);
+		ok1(ntdb_unlock_hashes(ntdb, h.hlock_start, h.hlock_range,
+				      F_WRLCK) == 0);
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+		/* Test second-level expansion: should expand 0th bucket. */
+		v = 0;
+		ok1(find_and_lock(ntdb, key, F_WRLCK, &h, &rec, NULL) == 0);
+		/* Should have created correct hash. */
+		ok1(h.h == ntdb_hash(ntdb, key.dptr, key.dsize));
+		/* Should have located space in the subhash's group 0, bucket 0. */
+		ok1(h.group_start == subhash + sizeof(struct ntdb_used_record));
+		ok1(h.home_bucket == 0);
+		ok1(h.found_bucket == 0);
+		ok1(h.hash_used == NTDB_TOPLEVEL_HASH_BITS+NTDB_SUBLEVEL_HASH_BITS);
+
+		/* Should have lock on bucket 0 */
+		ok1(h.hlock_start == 0);
+		ok1(h.hlock_range ==
+		    1ULL << (64-(NTDB_TOPLEVEL_HASH_BITS-NTDB_HASH_GROUP_BITS)));
+		ok1((ntdb->flags & NTDB_NOLOCK) || ntdb->file->num_lockrecs == 1);
+		ok1((ntdb->flags & NTDB_NOLOCK)
+		    || ntdb->file->lockrecs[0].off == NTDB_HASH_LOCK_START);
+		/* FIXME: Check lock length */
+
+		ok1(expand_group(ntdb, &h) == 0);
+		/* First entry should be subhash, the rest should be empty. */
+		ok1(is_subhash(h.group[0]));
+		subhash = (h.group[0] & NTDB_OFF_MASK);
+		for (j = 1; j < (1 << NTDB_HASH_GROUP_BITS); j++)
+			ok1(h.group[j] == 0);
+		ok1(ntdb_write_convert(ntdb, h.group_start,
+				      h.group, sizeof(h.group)) == 0);
+		ok1(ntdb_unlock_hashes(ntdb, h.hlock_start, h.hlock_range,
+				      F_WRLCK) == 0);
+
+		/* Should be happy with expansion. */
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+		ok1(find_and_lock(ntdb, key, F_WRLCK, &h, &rec, NULL) == 0);
+		/* Should have created correct hash. */
+		ok1(h.h == ntdb_hash(ntdb, key.dptr, key.dsize));
+		/* Should have located space in the newly expanded subhash, bucket 0. */
+		ok1(h.group_start == subhash + sizeof(struct ntdb_used_record));
+		ok1(h.home_bucket == 0);
+		ok1(h.found_bucket == 0);
+		ok1(h.hash_used == NTDB_TOPLEVEL_HASH_BITS
+		    + NTDB_SUBLEVEL_HASH_BITS * 2);
+
+		/* We should be able to add it now. */
+		/* Allocate a new record. */
+		new_off = alloc(ntdb, key.dsize, dbuf.dsize, h.h,
+				NTDB_USED_MAGIC, false);
+		ok1(!NTDB_OFF_IS_ERR(new_off));
+		ok1(add_to_hash(ntdb, &h, new_off) == 0);
+
+		/* Make sure we fill it in for later finding: key first, then data. */
+		off = new_off + sizeof(struct ntdb_used_record);
+		ok1(!ntdb->io->twrite(ntdb, off, key.dptr, key.dsize));
+		off += key.dsize;
+		ok1(!ntdb->io->twrite(ntdb, off, dbuf.dptr, dbuf.dsize));
+
+		/* We should be able to unlock that OK. */
+		ok1(ntdb_unlock_hashes(ntdb, h.hlock_start, h.hlock_range,
+				      F_WRLCK) == 0);
+
+		/* Database should be consistent. */
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+		/* Should be able to find it. */
+		v = 0;
+		ok1(find_and_lock(ntdb, key, F_WRLCK, &h, &rec, NULL)
+		    == new_off);
+		/* Should have created correct hash. */
+		ok1(h.h == ntdb_hash(ntdb, key.dptr, key.dsize));
+		/* Should have located space in expanded group 0, bucket 0. */
+		ok1(h.group_start == subhash + sizeof(struct ntdb_used_record));
+		ok1(h.home_bucket == 0);
+		ok1(h.found_bucket == 0);
+		ok1(h.hash_used == NTDB_TOPLEVEL_HASH_BITS
+		    + NTDB_SUBLEVEL_HASH_BITS * 2);
+
+		ntdb_close(ntdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/run-05-readonly-open.c b/lib/ntdb/test/run-05-readonly-open.c
new file mode 100644
index 0000000000..dd5aa26d0d
--- /dev/null
+++ b/lib/ntdb/test/run-05-readonly-open.c
@@ -0,0 +1,71 @@
+#include <ccan/failtest/failtest_override.h>
+#include "ntdb-source.h"
+#include "tap-interface.h"
+#include <ccan/failtest/failtest.h>
+#include "logging.h"
+#include "failtest_helper.h"
+
+int main(int argc, char *argv[])
+{	/* Check that an O_RDONLY open allows fetches but rejects stores. */
+	unsigned int i;
+	struct ntdb_context *ntdb;
+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
+	NTDB_DATA key = ntdb_mkdata("key", 3);
+	NTDB_DATA data = ntdb_mkdata("data", 4), d;
+	union ntdb_attribute seed_attr;
+	unsigned int msgs = 0;
+
+	failtest_init(argc, argv);
+	failtest_hook = block_repeat_failures;
+	failtest_exit_check = exit_check_log;
+
+	seed_attr.base.attr = NTDB_ATTRIBUTE_SEED;
+	seed_attr.base.next = &tap_log_attr;
+	seed_attr.seed.seed = 0;
+
+	failtest_suppress = true;
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 11);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		ntdb = ntdb_open("run-05-readonly-open.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &seed_attr);
+		if (!ntdb) /* Can't populate: stop short of the plan instead of using NULL. */
+			break;
+		ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
+		ntdb_close(ntdb);
+		failtest_suppress = false;
+		ntdb = ntdb_open("run-05-readonly-open.ntdb", flags[i],
+			       O_RDONLY, 0600, &tap_log_attr);
+		if (!ok1(ntdb))
+			break;
+		ok1(tap_log_messages == msgs);
+		/* Fetch should succeed, stores should fail. */
+		if (!ok1(ntdb_fetch(ntdb, key, &d) == 0))
+			goto fail;
+		ok1(ntdb_deq(d, data));
+		free(d.dptr);
+		if (!ok1(ntdb_store(ntdb, key, data, NTDB_MODIFY)
+			 == NTDB_ERR_RDONLY))
+			goto fail;
+		ok1(tap_log_messages == ++msgs); /* each rejected store logs one message */
+		if (!ok1(ntdb_store(ntdb, key, data, NTDB_INSERT)
+			 == NTDB_ERR_RDONLY))
+			goto fail;
+		ok1(tap_log_messages == ++msgs);
+		failtest_suppress = true;
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+		ntdb_close(ntdb);
+		ok1(tap_log_messages == msgs);
+		/* SIGH: failtest bug, it doesn't save the ntdb file because
+		 * we have it read-only.  If we go around again, it gets
+		 * changed underneath us and things get screwy. */
+		if (failtest_has_failed())
+			break;
+	}
+	failtest_exit(exit_status());
+
+fail:
+	failtest_suppress = true;
+	ntdb_close(ntdb);
+	failtest_exit(exit_status());
+}
diff --git a/lib/ntdb/test/run-10-simple-store.c b/lib/ntdb/test/run-10-simple-store.c
new file mode 100644
index 0000000000..6e718bf61f
--- /dev/null
+++ b/lib/ntdb/test/run-10-simple-store.c
@@ -0,0 +1,58 @@
+#include <ccan/failtest/failtest_override.h>
+#include "ntdb-source.h"
+#include "tap-interface.h"
+#include <ccan/failtest/failtest.h>
+#include "logging.h"
+#include "failtest_helper.h"
+
+int main(int argc, char *argv[])
+{	/* Check ntdb_store() MODIFY/INSERT results on a fresh database, under failtest. */
+	unsigned int i;
+	struct ntdb_context *ntdb;
+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
+			NTDB_NOMMAP|NTDB_CONVERT };
+	NTDB_DATA key = ntdb_mkdata("key", 3);
+	NTDB_DATA data = ntdb_mkdata("data", 4);
+
+	failtest_init(argc, argv);
+	failtest_hook = block_repeat_failures;
+	failtest_exit_check = exit_check_log;
+
+	failtest_suppress = true;
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 7 + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		ntdb = ntdb_open("run-10-simple-store.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (!ok1(ntdb))
+			break;
+		/* Modify should fail: the key has not been stored yet. */
+		failtest_suppress = false;
+		if (!ok1(ntdb_store(ntdb, key, data, NTDB_MODIFY)
+			 == NTDB_ERR_NOEXIST))
+			goto fail;
+		failtest_suppress = true;
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+		/* Insert should succeed. */
+		failtest_suppress = false;
+		if (!ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0))
+			goto fail;
+		failtest_suppress = true;
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+		/* Second insert should fail: the key is already present. */
+		failtest_suppress = false;
+		if (!ok1(ntdb_store(ntdb, key, data, NTDB_INSERT)
+			 == NTDB_ERR_EXISTS))
+			goto fail;
+		failtest_suppress = true;
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+		ntdb_close(ntdb);
+	}
+	ok1(tap_log_messages == 0);
+	failtest_exit(exit_status());
+
+fail:
+	failtest_suppress = true; /* only the stores above run with suppression off */
+	ntdb_close(ntdb);
+	failtest_exit(exit_status());
+}
diff --git a/lib/ntdb/test/run-11-simple-fetch.c b/lib/ntdb/test/run-11-simple-fetch.c
new file mode 100644
index 0000000000..525cf46444
--- /dev/null
+++ b/lib/ntdb/test/run-11-simple-fetch.c
@@ -0,0 +1,58 @@
+#include <ccan/failtest/failtest_override.h>
+#include "ntdb-source.h"
+#include "tap-interface.h"
+#include <ccan/failtest/failtest.h>
+#include "logging.h"
+#include "failtest_helper.h"
+
+int main(int argc, char *argv[])
+{	/* Check ntdb_fetch() before and after a store, with fetches run under failtest. */
+	unsigned int i;
+	struct ntdb_context *ntdb;
+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
+			NTDB_NOMMAP|NTDB_CONVERT };
+	NTDB_DATA key = ntdb_mkdata("key", 3);
+	NTDB_DATA data = ntdb_mkdata("data", 4);
+
+	failtest_init(argc, argv);
+	failtest_hook = block_repeat_failures;
+	failtest_exit_check = exit_check_log;
+
+	failtest_suppress = true;
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 8 + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		ntdb = ntdb_open("run-11-simple-fetch.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(ntdb);
+		if (ntdb) {
+			NTDB_DATA d = { NULL, 0 }; /* Bogus GCC warning */
+
+			/* Fetch should fail: nothing has been stored yet. */
+			failtest_suppress = false;
+			if (!ok1(ntdb_fetch(ntdb, key, &d) == NTDB_ERR_NOEXIST))
+				goto fail;
+			failtest_suppress = true;
+			ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+			/* Insert should succeed. */
+			ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
+			ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+			/* Fetch should now work and return what was stored. */
+			failtest_suppress = false;
+			if (!ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS))
+				goto fail;
+			failtest_suppress = true;
+			ok1(ntdb_deq(d, data));
+			free(d.dptr);
+			ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+			ntdb_close(ntdb);
+		}
+	}
+	ok1(tap_log_messages == 0);
+	failtest_exit(exit_status());
+
+fail:
+	failtest_suppress = true; /* only the fetches above run with suppression off */
+	ntdb_close(ntdb);
+	failtest_exit(exit_status());
+}
diff --git a/lib/ntdb/test/run-12-check.c b/lib/ntdb/test/run-12-check.c
new file mode 100644
index 0000000000..6040637048
--- /dev/null
+++ b/lib/ntdb/test/run-12-check.c
@@ -0,0 +1,46 @@
+#include "private.h"
+#include <ccan/failtest/failtest_override.h>
+#include "ntdb-source.h"
+#include "tap-interface.h"
+#include <ccan/failtest/failtest.h>
+#include "logging.h"
+#include "failtest_helper.h"
+
+int main(int argc, char *argv[])
+{	/* Run ntdb_check() on a one-record database with failtest suppression off. */
+	unsigned int i;
+	struct ntdb_context *ntdb;
+	int flags[] = { NTDB_INTERNAL,
+			NTDB_INTERNAL|NTDB_CONVERT,
+			NTDB_CONVERT };
+	NTDB_DATA key = ntdb_mkdata("key", 3);
+	NTDB_DATA data = ntdb_mkdata("data", 4);
+
+	failtest_init(argc, argv);
+	failtest_hook = block_repeat_failures;
+	failtest_exit_check = exit_check_log;
+
+	failtest_suppress = true;
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 3 + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		ntdb = ntdb_open("run-12-check.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (!ok1(ntdb)) /* Open failed: don't hand NULL to ntdb_store(). */
+			break;
+		ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
+		/* This is what we really want to test: ntdb_check(). */
+		failtest_suppress = false;
+		if (!ok1(ntdb_check(ntdb, NULL, NULL) == 0))
+			goto fail;
+		failtest_suppress = true;
+
+		ntdb_close(ntdb);
+	}
+	ok1(tap_log_messages == 0);
+	failtest_exit(exit_status());
+
+fail:
+	failtest_suppress = true;
+	ntdb_close(ntdb);
+	failtest_exit(exit_status());
+}
diff --git a/lib/ntdb/test/run-15-append.c b/lib/ntdb/test/run-15-append.c
new file mode 100644
index 0000000000..3c208137f2
--- /dev/null
+++ b/lib/ntdb/test/run-15-append.c
@@ -0,0 +1,130 @@
+#include "ntdb-source.h"
+#include "tap-interface.h"
+#include <ccan/ilog/ilog.h>
+#include "logging.h"
+
+#define MAX_SIZE 13100
+#define SIZE_STEP 131
+
+static ntdb_off_t ntdb_offset(struct ntdb_context *ntdb, NTDB_DATA key)
+{
+	struct hash_info hinfo;
+	struct ntdb_used_record rec;
+	ntdb_off_t where = find_and_lock(ntdb, key, F_RDLCK, &hinfo, &rec, NULL);
+	/* A failed lookup is reported as offset 0; no unlock is attempted then. */
+	if (!NTDB_OFF_IS_ERR(where)) {
+		ntdb_unlock_hashes(ntdb, hinfo.hlock_start, hinfo.hlock_range, F_RDLCK);
+		return where;
+	}
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{	/* Grow one record step by step and count how often its offset changes. */
+	unsigned int i, j, moves;
+	struct ntdb_context *ntdb;
+	unsigned char *buffer;
+	ntdb_off_t oldoff = 0, newoff;
+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
+			NTDB_NOMMAP|NTDB_CONVERT };
+	NTDB_DATA key = ntdb_mkdata("key", 3);
+	NTDB_DATA data;
+
+	buffer = malloc(MAX_SIZE);
+	for (i = 0; i < MAX_SIZE; i++)
+		buffer[i] = i;
+
+	plan_tests(sizeof(flags) / sizeof(flags[0])
+		   * ((3 + MAX_SIZE/SIZE_STEP * 5) * 2 + 7)
+		   + 1);
+
+	/* Using ntdb_store: replace the record with ever-longer data. */
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		ntdb = ntdb_open("run-append.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(ntdb);
+		if (!ntdb)
+			continue;
+
+		moves = 0;
+		for (j = 0; j < MAX_SIZE; j += SIZE_STEP) {
+			data.dptr = buffer;
+			data.dsize = j;
+			ok1(ntdb_store(ntdb, key, data, NTDB_REPLACE) == 0);
+			ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+			ok1(ntdb_fetch(ntdb, key, &data) == NTDB_SUCCESS);
+			ok1(data.dsize == j);
+			ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
+			free(data.dptr);
+			newoff = ntdb_offset(ntdb, key);
+			if (newoff != oldoff)
+				moves++;
+			oldoff = newoff;
+		}
+		ok1(!ntdb->file || (ntdb->file->allrecord_lock.count == 0
+				   && ntdb->file->num_lockrecs == 0));
+		/* We should increase by 50% each time... */
+		ok(moves <= ilog64(j / SIZE_STEP)*2,
+		   "Moved %u times", moves);
+		ntdb_close(ntdb);
+	}
+
+	/* Using ntdb_append: add only the bytes beyond the previous length. */
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		size_t prev_len = 0;
+		ntdb = ntdb_open("run-append.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(ntdb);
+		if (!ntdb)
+			continue;
+
+		moves = 0;
+		for (j = 0; j < MAX_SIZE; j += SIZE_STEP) {
+			data.dptr = buffer + prev_len;
+			data.dsize = j - prev_len;
+			ok1(ntdb_append(ntdb, key, data) == 0);
+			ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+			ok1(ntdb_fetch(ntdb, key, &data) == NTDB_SUCCESS);
+			ok1(data.dsize == j);
+			ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
+			free(data.dptr);
+			prev_len = data.dsize;
+			newoff = ntdb_offset(ntdb, key);
+			if (newoff != oldoff)
+				moves++;
+			oldoff = newoff;
+		}
+		ok1(!ntdb->file || (ntdb->file->allrecord_lock.count == 0
+				   && ntdb->file->num_lockrecs == 0));
+		/* We should increase by 50% each time... */
+		ok(moves <= ilog64(j / SIZE_STEP)*2,
+		   "Moved %u times", moves);
+		ntdb_close(ntdb);
+	}
+
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		ntdb = ntdb_open("run-append.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(ntdb);
+		if (!ntdb)
+			continue;
+
+		/* Huge initial store, made with ntdb_append on a freshly truncated db. */
+		data.dptr = buffer;
+		data.dsize = MAX_SIZE;
+		ok1(ntdb_append(ntdb, key, data) == 0);
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+		ok1(ntdb_fetch(ntdb, key, &data) == NTDB_SUCCESS);
+		ok1(data.dsize == MAX_SIZE);
+		ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
+		free(data.dptr);
+		ok1(!ntdb->file || (ntdb->file->allrecord_lock.count == 0
+				   && ntdb->file->num_lockrecs == 0));
+		ntdb_close(ntdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	free(buffer);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/run-20-growhash.c b/lib/ntdb/test/run-20-growhash.c
new file mode 100644
index 0000000000..5559370f2a
--- /dev/null
+++ b/lib/ntdb/test/run-20-growhash.c
@@ -0,0 +1,137 @@
+#include "ntdb-source.h"
+#include "tap-interface.h"
+#include "logging.h"
+
+static uint64_t myhash(const void *key, size_t len, uint64_t seed, void *priv)
+{	/* Identity hash: the 64-bit key is the hash; len, seed and priv are ignored. */
+	return *(const uint64_t *)key;
+}
+
+static void add_bits(uint64_t *val, unsigned new, unsigned new_bits, unsigned *done)
+{
+	const unsigned shift = 64 - *done - new_bits; /* place the field just below the bits already used */
+	*val |= (uint64_t)new << shift;
+	*done += new_bits;
+}
+
+static uint64_t make_key(unsigned topgroup, unsigned topbucket,
+			 unsigned subgroup1, unsigned subbucket1,
+			 unsigned subgroup2, unsigned subbucket2)
+{
+	const unsigned field[][2] = { /* { value, width in bits }, most significant first */
+		{ topgroup, NTDB_TOPLEVEL_HASH_BITS - NTDB_HASH_GROUP_BITS },
+		{ topbucket, NTDB_HASH_GROUP_BITS },
+		{ subgroup1, NTDB_SUBLEVEL_HASH_BITS - NTDB_HASH_GROUP_BITS },
+		{ subbucket1, NTDB_HASH_GROUP_BITS },
+		{ subgroup2, NTDB_SUBLEVEL_HASH_BITS - NTDB_HASH_GROUP_BITS },
+		{ subbucket2, NTDB_HASH_GROUP_BITS },
+	};
+	uint64_t hash = 0;
+	unsigned n, used = 0;
+	for (n = 0; n < sizeof(field) / sizeof(field[0]); n++)
+		add_bits(&hash, field[n][0], field[n][1], &used);
+	return hash;
+}
+
+int main(int argc, char *argv[])
+{	/* Use hand-built hash values to force subhash and sub-subhash growth via ntdb_store(). */
+	unsigned int i, j;
+	struct ntdb_context *ntdb;
+	uint64_t kdata;
+	struct ntdb_used_record rec;
+	NTDB_DATA key = { (unsigned char *)&kdata, sizeof(kdata) };
+	NTDB_DATA dbuf = { (unsigned char *)&kdata, sizeof(kdata) };
+	union ntdb_attribute hattr = { .hash = { .base = { NTDB_ATTRIBUTE_HASH },
+						.fn = myhash } };
+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
+			NTDB_NOMMAP|NTDB_CONVERT,
+	};
+
+	hattr.base.next = &tap_log_attr;
+
+	plan_tests(sizeof(flags) / sizeof(flags[0])
+		   * (9 + (20 + 2 * ((1 << NTDB_HASH_GROUP_BITS) - 2))
+		      * (1 << NTDB_HASH_GROUP_BITS)) + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		struct hash_info h;
+
+		ntdb = ntdb_open("run-20-growhash.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
+		ok1(ntdb);
+		if (!ntdb)
+			continue;
+
+		/* Fill a group: one key per bucket of top-level group 0. */
+		for (j = 0; j < (1 << NTDB_HASH_GROUP_BITS); j++) {
+			kdata = make_key(0, j, 0, 0, 0, 0);
+			ok1(ntdb_store(ntdb, key, dbuf, NTDB_INSERT) == 0);
+		}
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+		/* Check first still exists. */
+		kdata = make_key(0, 0, 0, 0, 0, 0);
+		ok1(find_and_lock(ntdb, key, F_RDLCK, &h, &rec, NULL) != 0);
+		/* Should have created correct hash. */
+		ok1(h.h == ntdb_hash(ntdb, key.dptr, key.dsize));
+		/* Should have located space in group 0, bucket 0. */
+		ok1(h.group_start == offsetof(struct ntdb_header, hashtable));
+		ok1(h.home_bucket == 0);
+		ok1(h.found_bucket == 0);
+		ok1(h.hash_used == NTDB_TOPLEVEL_HASH_BITS);
+		/* Entire group should be full! */
+		for (j = 0; j < (1 << NTDB_HASH_GROUP_BITS); j++)
+			ok1(h.group[j] != 0);
+
+		ok1(ntdb_unlock_hashes(ntdb, h.hlock_start, h.hlock_range,
+				      F_RDLCK) == 0);
+
+		/* Now, adding one more to each bucket should expand (that) bucket. */
+		for (j = 0; j < (1 << NTDB_HASH_GROUP_BITS); j++) {
+			unsigned int k;
+			kdata = make_key(0, j, 0, 1, 0, 0);
+			ok1(ntdb_store(ntdb, key, dbuf, NTDB_INSERT) == 0);
+			ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+			ok1(find_and_lock(ntdb, key, F_RDLCK, &h, &rec, NULL));
+			/* Should have created correct hash. */
+			ok1(h.h == ntdb_hash(ntdb, key.dptr, key.dsize));
+			/* Should have moved to subhash */
+			ok1(h.group_start >= sizeof(struct ntdb_header));
+			ok1(h.home_bucket == 1);
+			ok1(h.found_bucket == 1);
+			ok1(h.hash_used == NTDB_TOPLEVEL_HASH_BITS
+			    + NTDB_SUBLEVEL_HASH_BITS);
+			ok1(ntdb_unlock_hashes(ntdb, h.hlock_start, h.hlock_range,
+					      F_RDLCK) == 0);
+
+			/* Keep adding, make it expand again. */
+			for (k = 2; k < (1 << NTDB_HASH_GROUP_BITS); k++) {
+				kdata = make_key(0, j, 0, k, 0, 0);
+				ok1(ntdb_store(ntdb, key, dbuf, NTDB_INSERT) == 0);
+				ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+			}
+
+			/* This should tip it over to sub-sub-hash. */
+			kdata = make_key(0, j, 0, 0, 0, 1);
+			ok1(ntdb_store(ntdb, key, dbuf, NTDB_INSERT) == 0);
+			ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+			ok1(find_and_lock(ntdb, key, F_RDLCK, &h, &rec, NULL));
+			/* Should have created correct hash. */
+			ok1(h.h == ntdb_hash(ntdb, key.dptr, key.dsize));
+			/* Should have moved to sub-subhash */
+			ok1(h.group_start >= sizeof(struct ntdb_header));
+			ok1(h.home_bucket == 1);
+			ok1(h.found_bucket == 1);
+			ok1(h.hash_used == NTDB_TOPLEVEL_HASH_BITS
+			    + NTDB_SUBLEVEL_HASH_BITS + NTDB_SUBLEVEL_HASH_BITS);
+			ok1(ntdb_unlock_hashes(ntdb, h.hlock_start, h.hlock_range,
+					      F_RDLCK) == 0);
+		}
+		ntdb_close(ntdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/run-25-hashoverload.c b/lib/ntdb/test/run-25-hashoverload.c
new file mode 100644
index 0000000000..611eb71bf6
--- /dev/null
+++ b/lib/ntdb/test/run-25-hashoverload.c
@@ -0,0 +1,113 @@
+#include "ntdb-source.h"
+#include "tap-interface.h"
+#include "logging.h"
+
+static uint64_t badhash(const void *key, size_t len, uint64_t seed, void *priv)
+{	/* Worst-case hash: every key gets the same value, whatever the arguments. */
+	return 0;
+}
+
+static int trav(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA dbuf, void *p)
+{
+	/* Traverse callback: a non-NULL p asks us to delete the visited
+	 * record and pass on ntdb_delete()'s result; otherwise return 0. */
+	return p ? ntdb_delete(ntdb, key) : 0;
+}
+
+int main(int argc, char *argv[])
+{	/* With badhash every key hashes to 0: store, fetch, traverse and delete must still work. */
+	unsigned int i, j;
+	struct ntdb_context *ntdb;
+	NTDB_DATA key = { (unsigned char *)&j, sizeof(j) };
+	NTDB_DATA dbuf = { (unsigned char *)&j, sizeof(j) };
+	union ntdb_attribute hattr = { .hash = { .base = { NTDB_ATTRIBUTE_HASH },
+						.fn = badhash } };
+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
+			NTDB_NOMMAP|NTDB_CONVERT,
+	};
+
+	hattr.base.next = &tap_log_attr;
+
+	plan_tests(6883);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		NTDB_DATA d = { NULL, 0 }; /* Bogus GCC warning */
+
+		ntdb = ntdb_open("run-25-hashoverload.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
+		ok1(ntdb);
+		if (!ntdb)
+			continue;
+
+		/* Fill a group (key and data both alias j). */
+		for (j = 0; j < (1 << NTDB_HASH_GROUP_BITS); j++) {
+			ok1(ntdb_store(ntdb, key, dbuf, NTDB_INSERT) == 0);
+		}
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+		/* Now store one last value (j is the group size here): should form chain. */
+		ok1(ntdb_store(ntdb, key, dbuf, NTDB_INSERT) == 0);
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+		/* Check we can find them all. */
+		for (j = 0; j < (1 << NTDB_HASH_GROUP_BITS) + 1; j++) {
+			ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
+			ok1(d.dsize == sizeof(j));
+			ok1(d.dptr != NULL);
+			ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
+			free(d.dptr);
+		}
+
+		/* Now add a *lot* more. */
+		for (j = (1 << NTDB_HASH_GROUP_BITS) + 1;
+		     j < (16 << NTDB_HASH_GROUP_BITS);
+		     j++) {
+			ok1(ntdb_store(ntdb, key, dbuf, NTDB_INSERT) == 0);
+			ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
+			ok1(d.dsize == sizeof(j));
+			ok1(d.dptr != NULL);
+			ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
+			free(d.dptr);
+		}
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+		/* Traverse through them: should see every record stored so far. */
+		ok1(ntdb_traverse(ntdb, trav, NULL) == j);
+
+		/* Empty the first chain-worth. */
+		for (j = 0; j < (1 << NTDB_HASH_GROUP_BITS); j++)
+			ok1(ntdb_delete(ntdb, key) == 0);
+
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+		for (j = (1 << NTDB_HASH_GROUP_BITS);
+		     j < (16 << NTDB_HASH_GROUP_BITS);
+		     j++) {
+			ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
+			ok1(d.dsize == sizeof(j));
+			ok1(d.dptr != NULL);
+			ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
+			free(d.dptr);
+		}
+
+		/* Traverse through them: only the undeleted records remain. */
+		ok1(ntdb_traverse(ntdb, trav, NULL)
+		    == (15 << NTDB_HASH_GROUP_BITS));
+
+		/* Re-add the ones deleted above. */
+		for (j = 0; j < (1 << NTDB_HASH_GROUP_BITS); j++) {
+			ok1(ntdb_store(ntdb, key, dbuf, NTDB_INSERT) == 0);
+		}
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+		/* Now try deleting as we go: a non-NULL p makes trav() delete. */
+		ok1(ntdb_traverse(ntdb, trav, trav)
+		    == (16 << NTDB_HASH_GROUP_BITS));
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+		ok1(ntdb_traverse(ntdb, trav, NULL) == 0);
+		ntdb_close(ntdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/run-30-exhaust-before-expand.c b/lib/ntdb/test/run-30-exhaust-before-expand.c
new file mode 100644
index 0000000000..b94bc01bff
--- /dev/null
+++ b/lib/ntdb/test/run-30-exhaust-before-expand.c
@@ -0,0 +1,71 @@
+#include "ntdb-source.h"
+#include "tap-interface.h"
+#include "logging.h"
+
+static bool empty_freetable(struct ntdb_context *ntdb)
+{
+	struct ntdb_freetable table;
+	unsigned int b = sizeof(table.buckets) / sizeof(table.buckets[0]);
+
+	/* Load the free table from ftable_off; a failed read aborts the test. */
+	if (ntdb_read_convert(ntdb, ntdb->ftable_off, &table, sizeof(table)))
+		abort();
+
+	/* Any non-zero bucket means the free table is not empty. */
+	while (b-- > 0)
+		if (table.buckets[b])
+			return false;
+	return true;
+}
+
+
+int main(int argc, char *argv[])
+{	/* Check that stores use up the whole free table before the file is expanded. */
+	unsigned int i, j;
+	struct ntdb_context *ntdb;
+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
+			NTDB_NOMMAP|NTDB_CONVERT };
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 9 + 1);
+
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		NTDB_DATA k;
+		uint64_t size;
+		bool was_empty = false;
+
+		k.dptr = (void *)&j; /* key (and data) alias the loop counter j */
+		k.dsize = sizeof(j);
+
+		ntdb = ntdb_open("run-30-exhaust-before-expand.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(ntdb);
+		if (!ntdb)
+			continue;
+
+		ok1(empty_freetable(ntdb));
+		/* Need some hash lock for expand. */
+		ok1(ntdb_lock_hashes(ntdb, 0, 1, F_WRLCK, NTDB_LOCK_WAIT) == 0);
+		/* Create some free space. */
+		ok1(ntdb_expand(ntdb, 1) == 0);
+		ok1(ntdb_unlock_hashes(ntdb, 0, 1, F_WRLCK) == 0);
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+		ok1(!empty_freetable(ntdb));
+
+		size = ntdb->file->map_size;
+		/* Insert minimal-length records until we expand. */
+		for (j = 0; ntdb->file->map_size == size; j++) {
+			was_empty = empty_freetable(ntdb);
+			if (ntdb_store(ntdb, k, k, NTDB_INSERT) != 0)
+				err(1, "Failed to store record %u", j);
+		}
+
+		/* Would have been empty before expansion, but no longer. */
+		ok1(was_empty);
+		ok1(!empty_freetable(ntdb));
+		ntdb_close(ntdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/run-35-convert.c b/lib/ntdb/test/run-35-convert.c
new file mode 100644
index 0000000000..6a38d425cb
--- /dev/null
+++ b/lib/ntdb/test/run-35-convert.c
@@ -0,0 +1,54 @@
+#include "private.h"
+#include <ccan/failtest/failtest_override.h>
+#include "ntdb-source.h"
+#include "tap-interface.h"
+#include <ccan/failtest/failtest.h>
+#include "logging.h"
+#include "failtest_helper.h"
+
+int main(int argc, char *argv[])
+{
+	unsigned int i, messages = 0;
+	struct ntdb_context *ntdb;
+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
+	/* Plan: 3 checks for each NTDB_CONVERT entry, 5 for each of the others. */
+	failtest_init(argc, argv);
+	failtest_hook = block_repeat_failures;
+	failtest_exit_check = exit_check_log;
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 4);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		ntdb = ntdb_open("run-35-convert.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (!ok1(ntdb))
+			failtest_exit(exit_status());
+
+		ntdb_close(ntdb);
+		/* If we say NTDB_CONVERT, it must be converted */
+		ntdb = ntdb_open("run-35-convert.ntdb",
+			       flags[i]|NTDB_CONVERT,
+			       O_RDWR, 0600, &tap_log_attr);
+		if (flags[i] & NTDB_CONVERT) {
+			if (!ntdb)
+				failtest_exit(exit_status());
+			ok1(ntdb_get_flags(ntdb) & NTDB_CONVERT);
+			ntdb_close(ntdb);
+		} else {
+			if (!ok1(!ntdb && errno == EIO))
+				failtest_exit(exit_status());
+			ok1(tap_log_messages == ++messages); /* one new log entry */
+			if (!ok1(log_last && strstr(log_last, "NTDB_CONVERT")))
+				failtest_exit(exit_status());
+		}
+
+		/* If we don't say NTDB_CONVERT, it *may* be converted */
+		ntdb = ntdb_open("run-35-convert.ntdb",
+			       flags[i] & ~NTDB_CONVERT,
+			       O_RDWR, 0600, &tap_log_attr);
+		if (!ntdb)
+			failtest_exit(exit_status());
+		ok1(ntdb_get_flags(ntdb) == flags[i]);
+		ntdb_close(ntdb);
+	}
+	failtest_exit(exit_status());
+}
diff --git a/lib/ntdb/test/run-50-multiple-freelists.c b/lib/ntdb/test/run-50-multiple-freelists.c
new file mode 100644
index 0000000000..962462e5d4
--- /dev/null
+++ b/lib/ntdb/test/run-50-multiple-freelists.c
@@ -0,0 +1,70 @@
+#include "ntdb-source.h"
+#include "tap-interface.h"
+#include "logging.h"
+#include "layout.h"
+
+int main(int argc, char *argv[])
+{
+	ntdb_off_t off;
+	struct ntdb_context *ntdb;
+	struct ntdb_layout *layout;
+	NTDB_DATA key, data;
+	union ntdb_attribute seed;
+	/* Check which record, and which free table, each get_free() call picks. */
+	/* This seed value previously tickled a layout.c bug. */
+	seed.base.attr = NTDB_ATTRIBUTE_SEED;
+	seed.seed.seed = 0xb1142bc054d035b4ULL;
+	seed.base.next = &tap_log_attr;
+
+	plan_tests(11);
+	key = ntdb_mkdata("Hello", 5);
+	data = ntdb_mkdata("world", 5);
+
+	/* Create a NTDB with three free tables. */
+	layout = new_ntdb_layout();
+	ntdb_layout_add_freetable(layout);
+	ntdb_layout_add_freetable(layout);
+	ntdb_layout_add_freetable(layout);
+	ntdb_layout_add_free(layout, 80, 0);
+	/* Used records between the free ones prevent coalescing. */
+	ntdb_layout_add_used(layout, key, data, 6);
+	ntdb_layout_add_free(layout, 160, 1);
+	key.dsize--;
+	ntdb_layout_add_used(layout, key, data, 7);
+	ntdb_layout_add_free(layout, 320, 2);
+	key.dsize--;
+	ntdb_layout_add_used(layout, key, data, 8);
+	ntdb_layout_add_free(layout, 40, 0);
+	ntdb = ntdb_layout_get(layout, free, &seed);
+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+	/* NOTE(review): elem[] indices appear to follow the add order above. */
+	off = get_free(ntdb, 0, 80 - sizeof(struct ntdb_used_record), 0,
+		       NTDB_USED_MAGIC, 0);
+	ok1(off == layout->elem[3].base.off);
+	ok1(ntdb->ftable_off == layout->elem[0].base.off);
+
+	off = get_free(ntdb, 0, 160 - sizeof(struct ntdb_used_record), 0,
+		       NTDB_USED_MAGIC, 0);
+	ok1(off == layout->elem[5].base.off);
+	ok1(ntdb->ftable_off == layout->elem[1].base.off);
+
+	off = get_free(ntdb, 0, 320 - sizeof(struct ntdb_used_record), 0,
+		       NTDB_USED_MAGIC, 0);
+	ok1(off == layout->elem[7].base.off);
+	ok1(ntdb->ftable_off == layout->elem[2].base.off);
+
+	off = get_free(ntdb, 0, 40 - sizeof(struct ntdb_used_record), 0,
+		       NTDB_USED_MAGIC, 0);
+	ok1(off == layout->elem[9].base.off);
+	ok1(ntdb->ftable_off == layout->elem[0].base.off);
+
+	/* All four free records are used up now, so this request must fail. */
+	off = get_free(ntdb, 0, 0, 1, NTDB_USED_MAGIC, 0);
+	ok1(off == 0);
+
+	ntdb_close(ntdb);
+	ntdb_layout_free(layout);
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/run-56-open-during-transaction.c b/lib/ntdb/test/run-56-open-during-transaction.c
new file mode 100644
index 0000000000..f585aa13c8
--- /dev/null
+++ b/lib/ntdb/test/run-56-open-during-transaction.c
@@ -0,0 +1,165 @@
+#include "private.h"
+#include <unistd.h>
+#include "lock-tracking.h"
+
+static ssize_t pwrite_check(int fd, const void *buf, size_t count, off_t offset);
+static ssize_t write_check(int fd, const void *buf, size_t count);
+static int ftruncate_check(int fd, off_t length);
+
+#define pwrite pwrite_check
+#define write write_check
+#define fcntl fcntl_with_lockcheck
+#define ftruncate ftruncate_check
+
+#include "ntdb-source.h"
+#include "tap-interface.h"
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdarg.h>
+#include "external-agent.h"
+#include "logging.h"
+
+static struct agent *agent;
+static bool opened;
+static int errors = 0;
+#define TEST_DBNAME "run-56-open-during-transaction.ntdb"
+
+#undef write
+#undef pwrite
+#undef fcntl
+#undef ftruncate
+
+/* Return true iff the first len bytes of snapshot and latest are identical. */
+static bool is_same(const char *snapshot, const char *latest, off_t len)
+{
+	/*
+	 * memcmp() takes a size_t count, so a length above UINT_MAX cannot
+	 * wrap a narrower loop index.  A non-positive len compares nothing
+	 * and is reported as "same".
+	 */
+	return len <= 0 || memcmp(snapshot, latest, (size_t)len) == 0;
+}
+
+/* Does fd currently hold exactly snapshot_len bytes equal to snapshot? */
+static bool compare_file(int fd, const char *snapshot, off_t snapshot_len)
+{
+	char *latest = malloc(snapshot_len+1);
+	bool unchanged = false;
+
+	/* Reading one byte too many makes a longer file fail the length test. */
+	if (pread(fd, latest, snapshot_len+1, 0) == snapshot_len)
+		unchanged = is_same(snapshot, latest, snapshot_len);
+	free(latest);
+	return unchanged;
+}
+
+/* Check that an agent open of TEST_DBNAME leaves the file bytes unchanged. */
+static void check_file_intact(int fd)
+{
+	enum agent_return ret;
+	struct stat st;
+	char *contents;
+
+	fstat(fd, &st);
+	contents = malloc(st.st_size);
+	if (pread(fd, contents, st.st_size, 0) != st.st_size) {
+		diag("Read fail");
+		errors++;
+		free(contents);	/* don't leak the snapshot on this path */
+		return;
+	}
+
+	/* Ask agent to open file. */
+	ret = external_agent_operation(agent, OPEN, TEST_DBNAME);
+
+	/* It's OK to open it, but it must not have changed! */
+	if (!compare_file(fd, contents, st.st_size)) {
+		diag("Agent changed file after opening %s",
+		     agent_return_name(ret));
+		errors++;
+	}
+
+	if (ret == SUCCESS) {
+		ret = external_agent_operation(agent, CLOSE, NULL);
+		if (ret != SUCCESS) {
+			diag("Agent failed to close ntdb: %s",
+			     agent_return_name(ret));
+			errors++;
+		}
+	} else if (ret != WOULD_HAVE_BLOCKED) {
+		diag("Agent opening file gave %s", agent_return_name(ret));
+		errors++;
+	}
+	free(contents);
+}
+
+static void after_unlock(int fd)
+{
+	if (opened)	/* main() sets this only while its ntdb handle is open */
+		check_file_intact(fd);
+}
+
+/* pwrite() stand-in used by the ntdb code compiled into this test. */
+static ssize_t pwrite_check(int fd,
+			    const void *buf, size_t count, off_t offset)
+{
+	/* Same agent probe as the unlock hook: a no-op unless 'opened'. */
+	after_unlock(fd);
+	return pwrite(fd, buf, count, offset);
+}
+
+/* write() stand-in used by the ntdb code compiled into this test. */
+static ssize_t write_check(int fd, const void *buf, size_t count)
+{
+	/* Same agent probe as the unlock hook: a no-op unless 'opened'. */
+	after_unlock(fd);
+	return write(fd, buf, count);
+}
+
+/* ftruncate() stand-in used by the ntdb code compiled into this test. */
+static int ftruncate_check(int fd, off_t length)
+{
+	/* Same agent probe as the unlock hook: a no-op unless 'opened'. */
+	after_unlock(fd);
+
+	return ftruncate(fd, length);
+}
+
+/* Run a transaction per flag set while the agent probes the db on each write. */
+int main(int argc, char *argv[])
+{
+	const int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
+	unsigned int i;	/* unsigned: compared against a sizeof() quotient */
+	struct ntdb_context *ntdb;
+	NTDB_DATA key, data;
+
+	plan_tests(sizeof(flags)/sizeof(flags[0]) * 5);
+	agent = prepare_external_agent();
+	if (!agent)
+		err(1, "preparing agent");
+
+	unlock_callback = after_unlock;
+	for (i = 0; i < sizeof(flags)/sizeof(flags[0]); i++) {
+		diag("Test with %s and %s\n",
+		     (flags[i] & NTDB_CONVERT) ? "CONVERT" : "DEFAULT",
+		     (flags[i] & NTDB_NOMMAP) ? "no mmap" : "mmap");
+		unlink(TEST_DBNAME);
+		ntdb = ntdb_open(TEST_DBNAME, flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(ntdb);
+		if (!ntdb)	/* don't hand NULL to the calls below */
+			continue;
+
+		opened = true;
+		ok1(ntdb_transaction_start(ntdb) == 0);
+		key = ntdb_mkdata("hi", strlen("hi"));
+		data = ntdb_mkdata("world", strlen("world"));
+		ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
+		ok1(ntdb_transaction_commit(ntdb) == 0);
+		ok(!errors, "We had %u open errors", errors);
+		opened = false;
+		ntdb_close(ntdb);
+	}
+	return exit_status();
+}
diff --git a/lib/ntdb/test/run-57-die-during-transaction.c b/lib/ntdb/test/run-57-die-during-transaction.c
new file mode 100644
index 0000000000..98ec9dd63a
--- /dev/null
+++ b/lib/ntdb/test/run-57-die-during-transaction.c
@@ -0,0 +1,294 @@
+#include "private.h"
+#include <unistd.h>
+#include "lock-tracking.h"
+#include "tap-interface.h"
+#include <stdlib.h>
+#include <assert.h>
+static ssize_t pwrite_check(int fd, const void *buf, size_t count, off_t offset);
+static ssize_t write_check(int fd, const void *buf, size_t count);
+static int ftruncate_check(int fd, off_t length);
+
+#define pwrite pwrite_check
+#define write write_check
+#define fcntl fcntl_with_lockcheck
+#define ftruncate ftruncate_check
+
+/* There's a malloc inside transaction_setup_recovery, and valgrind complains
+ * when we longjmp and leak it. */
+#define MAX_ALLOCATIONS 10
+static void *allocated[MAX_ALLOCATIONS];
+static unsigned max_alloc = 0;
+
+static void *malloc_noleak(size_t len)
+{
+	unsigned int i;
+	/* Record the new pointer in the first empty allocated[] slot. */
+	for (i = 0; i < MAX_ALLOCATIONS; i++)
+		if (!allocated[i]) {
+			allocated[i] = malloc(len);
+			if (i > max_alloc) {	/* new highest slot index used */
+				max_alloc = i;
+				diag("max_alloc: %i", max_alloc);
+			}
+			return allocated[i];
+		}
+	diag("Too many allocations!");	/* all MAX_ALLOCATIONS slots in use */
+	abort();
+}
+
+static void *realloc_noleak(void *p, size_t size)
+{
+	unsigned int slot;
+	/* Resize p within its table slot; a pointer not in the table is fatal. */
+	for (slot = 0; slot < MAX_ALLOCATIONS; slot++) {
+		if (allocated[slot] != p)
+			continue;
+		if (slot > max_alloc) {
+			max_alloc = slot;
+			diag("max_alloc: %i", max_alloc);
+		}
+		return allocated[slot] = realloc(p, size);
+	}
+	diag("Untracked realloc!");
+	abort();
+}
+
+/* free() counterpart of malloc_noleak(): untrack p, then release it. */
+static void free_noleak(void *p)
+{
+	unsigned int slot = 0;
+
+	/* We don't catch asprintf, so a pointer we never tracked is not an
+	 * error: just fall through and free it. */
+	while (slot < MAX_ALLOCATIONS && allocated[slot] != p)
+		slot++;
+	if (slot < MAX_ALLOCATIONS)
+		allocated[slot] = NULL;
+	free(p);
+}
+
+static void free_all(void)
+{
+	unsigned int left = MAX_ALLOCATIONS;
+	/* Release whatever is still tracked; free(NULL) covers empty slots. */
+	while (left--) {
+		free(allocated[left]);
+		allocated[left] = NULL;
+	}
+}
+
+#define malloc malloc_noleak
+#define free free_noleak
+#define realloc realloc_noleak
+
+#include "ntdb-source.h"
+
+#undef malloc
+#undef free
+#undef realloc
+#undef write
+#undef pwrite
+#undef fcntl
+#undef ftruncate
+
+#include <stdbool.h>
+#include <stdarg.h>
+#include <ccan/err/err.h>
+#include <setjmp.h>
+#include "external-agent.h"
+#include "logging.h"
+
+static bool in_transaction;
+static int target, current;
+static jmp_buf jmpbuf;
+#define TEST_DBNAME "run-57-die-during-transaction.ntdb"
+#define KEY_STRING "helloworld"
+
+static void maybe_die(int fd)
+{
+	/* Every call made inside the transaction counts as one death point. */
+	if (in_transaction && current++ == target)
+		longjmp(jmpbuf, 1);
+}
+
+/* pwrite() stand-in: a possible death point before, and after, the write. */
+static ssize_t pwrite_check(int fd,
+			    const void *buf, size_t count, off_t offset)
+{
+	ssize_t written;
+
+	maybe_die(fd);
+	written = pwrite(fd, buf, count, offset);
+
+	/* Only a complete write is followed by the second death point. */
+	if (written == count)
+		maybe_die(fd);
+	return written;
+}
+
+/* write() stand-in: a possible death point before, and after, the write. */
+static ssize_t write_check(int fd, const void *buf, size_t count)
+{
+	ssize_t written;
+
+	maybe_die(fd);
+	written = write(fd, buf, count);
+
+	/* Only a complete write is followed by the second death point. */
+	if (written == count)
+		maybe_die(fd);
+	return written;
+}
+
+/* ftruncate() stand-in: a possible death point on each side of the call. */
+static int ftruncate_check(int fd, off_t length)
+{
+	int status;
+
+	maybe_die(fd);
+	status = ftruncate(fd, length);
+	/* The second death point is reached whatever the truncate returned. */
+	maybe_die(fd);
+	return status;
+}
+
+static bool test_death(enum operation op, struct agent *agent)
+{
+	struct ntdb_context *ntdb = NULL;
+	NTDB_DATA key;
+	enum agent_return ret;
+	int needed_recovery = 0;
+	/* Die at step 0, then 1, 2, ... until a whole transaction survives. */
+	current = target = 0;
+reset:
+	unlink(TEST_DBNAME);
+	ntdb = ntdb_open(TEST_DBNAME, NTDB_NOMMAP,
+		       O_CREAT|O_TRUNC|O_RDWR, 0600, &tap_log_attr);
+	if (!ntdb) {
+		diag("Failed opening NTDB: %s", strerror(errno));
+		return false;
+	}
+	/* Non-zero return: maybe_die() just longjmp()ed out of the transaction. */
+	if (setjmp(jmpbuf) != 0) {
+		/* We're partway through.  Simulate our death. */
+		close(ntdb->file->fd);
+		forget_locking();
+		in_transaction = false;
+
+		ret = external_agent_operation(agent, NEEDS_RECOVERY, "");
+		if (ret == SUCCESS)
+			needed_recovery++;
+		else if (ret != FAILED) {
+			diag("Step %u agent NEEDS_RECOVERY = %s", current,
+			     agent_return_name(ret));
+			return false;
+		}
+
+		ret = external_agent_operation(agent, op, KEY_STRING);
+		if (ret != SUCCESS) {
+			diag("Step %u op %s failed = %s", current,
+			     operation_name(op),
+			     agent_return_name(ret));
+			return false;
+		}
+
+		ret = external_agent_operation(agent, NEEDS_RECOVERY, "");
+		if (ret != FAILED) {
+			diag("Still needs recovery after step %u = %s",
+			     current, agent_return_name(ret));
+			return false;
+		}
+
+		ret = external_agent_operation(agent, CHECK, "");
+		if (ret != SUCCESS) {
+			diag("Step %u check failed = %s", current,
+			     agent_return_name(ret));
+			return false;
+		}
+
+		ret = external_agent_operation(agent, CLOSE, "");
+		if (ret != SUCCESS) {
+			diag("Step %u close failed = %s", current,
+			     agent_return_name(ret));
+			return false;
+		}
+
+		/* Suppress logging as this tries to use closed fd. */
+		suppress_logging = true;
+		suppress_lockcheck = true;
+		ntdb_close(ntdb);
+		suppress_logging = false;
+		suppress_lockcheck = false;
+		target++;	/* the next run dies one step later */
+		current = 0;
+		free_all();
+		goto reset;
+	}
+
+	/* Put key for agent to fetch. */
+	key = ntdb_mkdata(KEY_STRING, strlen(KEY_STRING));
+	if (ntdb_store(ntdb, key, key, NTDB_INSERT) != 0)
+		return false;
+
+	/* This is the key we insert in transaction. */
+	key.dsize--;
+
+	ret = external_agent_operation(agent, OPEN, TEST_DBNAME);
+	if (ret != SUCCESS)
+		errx(1, "Agent failed to open: %s", agent_return_name(ret));
+
+	ret = external_agent_operation(agent, FETCH, KEY_STRING);
+	if (ret != SUCCESS)
+		errx(1, "Agent failed find key: %s", agent_return_name(ret));
+
+	in_transaction = true;	/* from here on maybe_die() counts steps */
+	if (ntdb_transaction_start(ntdb) != 0)
+		return false;
+
+	if (ntdb_store(ntdb, key, key, NTDB_INSERT) != 0)
+		return false;
+
+	if (ntdb_transaction_commit(ntdb) != 0)
+		return false;
+
+	in_transaction = false;
+
+	/* We made it! */
+	diag("Completed %u runs", current);
+	ntdb_close(ntdb);
+	ret = external_agent_operation(agent, CLOSE, "");
+	if (ret != SUCCESS) {
+		diag("Step %u close failed = %s", current,
+		     agent_return_name(ret));
+		return false;
+	}
+
+	ok1(needed_recovery);
+	ok1(locking_errors == 0);
+	ok1(forget_locking() == 0);
+	locking_errors = 0;
+	return true;
+}
+
+int main(int argc, char *argv[])
+{
+	enum operation ops[] = { FETCH, STORE, TRANSACTION_START };
+	struct agent *agent;
+	int i;
+	/* 3 ops x (3 checks inside test_death() + 1 in the loop below) = 12. */
+	plan_tests(12);
+	unlock_callback = maybe_die;
+
+	external_agent_free = free_noleak;
+	agent = prepare_external_agent();
+	if (!agent)
+		err(1, "preparing agent");
+
+	for (i = 0; i < sizeof(ops)/sizeof(ops[0]); i++) {
+		diag("Testing %s after death", operation_name(ops[i]));
+		ok1(test_death(ops[i], agent));
+	}
+
+	free_external_agent(agent);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/run-64-bit-tdb.c b/lib/ntdb/test/run-64-bit-tdb.c
new file mode 100644
index 0000000000..6a146cb1cf
--- /dev/null
+++ b/lib/ntdb/test/run-64-bit-tdb.c
@@ -0,0 +1,72 @@
+#include "ntdb-source.h"
+#include "tap-interface.h"
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct ntdb_context *ntdb;
+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_CONVERT,
+			NTDB_NOMMAP|NTDB_CONVERT };
+	/* Exercise a record placed at offset 0xFFFFFFF0; needs a 64-bit off_t. */
+	if (sizeof(off_t) <= 4) {
+		plan_tests(1);
+		pass("No 64 bit off_t");
+		return exit_status();
+	}
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 14);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		off_t old_size;
+		NTDB_DATA k, d;
+		struct hash_info h;
+		struct ntdb_used_record rec;
+		ntdb_off_t off;
+
+		ntdb = ntdb_open("run-64-bit-ntdb.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(ntdb);
+		if (!ntdb)
+			continue;
+
+		old_size = ntdb->file->map_size;
+
+		/* This makes a sparse file */
+		ok1(ftruncate(ntdb->file->fd, 0xFFFFFFF0) == 0);
+		ok1(add_free_record(ntdb, old_size, 0xFFFFFFF0 - old_size,
+				    NTDB_LOCK_WAIT, false) == NTDB_SUCCESS);
+
+		/* Now add a little record past the 4G barrier. */
+		ok1(ntdb_expand_file(ntdb, 100) == NTDB_SUCCESS);
+		ok1(add_free_record(ntdb, 0xFFFFFFF0, 100, NTDB_LOCK_WAIT, false)
+		    == NTDB_SUCCESS);
+
+		ok1(ntdb_check(ntdb, NULL, NULL) == NTDB_SUCCESS);
+
+		/* Test allocation path. */
+		k = ntdb_mkdata("key", 4);	/* lengths 4 and 5 include the NUL */
+		d = ntdb_mkdata("data", 5);
+		ok1(ntdb_store(ntdb, k, d, NTDB_INSERT) == 0);
+		ok1(ntdb_check(ntdb, NULL, NULL) == NTDB_SUCCESS);
+
+		/* Make sure it put it at end as we expected. */
+		off = find_and_lock(ntdb, k, F_RDLCK, &h, &rec, NULL);
+		ok1(off >= 0xFFFFFFF0);
+		ntdb_unlock_hashes(ntdb, h.hlock_start, h.hlock_range, F_RDLCK);
+
+		ok1(ntdb_fetch(ntdb, k, &d) == 0);
+		ok1(d.dsize == 5);
+		ok1(strcmp((char *)d.dptr, "data") == 0);
+		free(d.dptr);
+
+		ok1(ntdb_delete(ntdb, k) == 0);
+		ok1(ntdb_check(ntdb, NULL, NULL) == NTDB_SUCCESS);
+
+		ntdb_close(ntdb);
+	}
+
+	/* We might get messages about mmap failing, so don't test
+	 * tap_log_messages */
+	return exit_status();
+}
diff --git a/lib/ntdb/test/run-90-get-set-attributes.c b/lib/ntdb/test/run-90-get-set-attributes.c
new file mode 100644
index 0000000000..fc265b0729
--- /dev/null
+++ b/lib/ntdb/test/run-90-get-set-attributes.c
@@ -0,0 +1,159 @@
+#include "ntdb-source.h"
+#include "tap-interface.h"
+#include "logging.h"
+
+static int mylock(int fd, int rw, off_t off, off_t len, bool waitflag,
+		  void *unused)
+{
+	return 0;	/* stub: takes no lock, always returns 0 */
+}
+
+static int myunlock(int fd, int rw, off_t off, off_t len, void *unused)
+{
+	return 0;	/* stub: releases nothing, always returns 0 */
+}
+
+static uint64_t hash_fn(const void *key, size_t len, uint64_t seed,
+			void *priv)
+{
+	return 0;	/* stub: every key hashes to 0 */
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct ntdb_context *ntdb;
+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
+	union ntdb_attribute seed_attr;
+	union ntdb_attribute hash_attr;
+	union ntdb_attribute lock_attr;
+	/* Attribute chain for the second open: seed -> hash -> flock -> log. */
+	seed_attr.base.attr = NTDB_ATTRIBUTE_SEED;
+	seed_attr.base.next = &hash_attr;
+	seed_attr.seed.seed = 100;
+
+	hash_attr.base.attr = NTDB_ATTRIBUTE_HASH;
+	hash_attr.base.next = &lock_attr;
+	hash_attr.hash.fn = hash_fn;
+	hash_attr.hash.data = &hash_attr;
+
+	lock_attr.base.attr = NTDB_ATTRIBUTE_FLOCK;
+	lock_attr.base.next = &tap_log_attr;
+	lock_attr.flock.lock = mylock;
+	lock_attr.flock.unlock = myunlock;
+	lock_attr.flock.data = &lock_attr;
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 50);
+	/* Per flag set: open with no attributes, then with the whole chain. */
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		union ntdb_attribute attr;
+
+		/* First open with no attributes. */
+		ntdb = ntdb_open("run-90-get-set-attributes.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, NULL);
+		ok1(ntdb);
+		if (!ntdb)	/* don't hand NULL to the calls below */
+			continue;
+		/* Get log on no attributes will fail */
+		attr.base.attr = NTDB_ATTRIBUTE_LOG;
+		ok1(ntdb_get_attribute(ntdb, &attr) == NTDB_ERR_NOEXIST);
+		/* These always work. */
+		attr.base.attr = NTDB_ATTRIBUTE_HASH;
+		ok1(ntdb_get_attribute(ntdb, &attr) == 0);
+		ok1(attr.base.attr == NTDB_ATTRIBUTE_HASH);
+		ok1(attr.hash.fn == ntdb_jenkins_hash);
+		attr.base.attr = NTDB_ATTRIBUTE_FLOCK;
+		ok1(ntdb_get_attribute(ntdb, &attr) == 0);
+		ok1(attr.base.attr == NTDB_ATTRIBUTE_FLOCK);
+		ok1(attr.flock.lock == ntdb_fcntl_lock);
+		ok1(attr.flock.unlock == ntdb_fcntl_unlock);
+		attr.base.attr = NTDB_ATTRIBUTE_SEED;
+		ok1(ntdb_get_attribute(ntdb, &attr) == 0);
+		ok1(attr.base.attr == NTDB_ATTRIBUTE_SEED);
+		/* This is possible, just astronomically unlikely. */
+		ok1(attr.seed.seed != 0);
+
+		/* Unset attributes. */
+		ntdb_unset_attribute(ntdb, NTDB_ATTRIBUTE_LOG);
+		ntdb_unset_attribute(ntdb, NTDB_ATTRIBUTE_FLOCK);
+
+		/* Set them. */
+		ok1(ntdb_set_attribute(ntdb, &tap_log_attr) == 0);
+		ok1(ntdb_set_attribute(ntdb, &lock_attr) == 0);
+		/* These should fail. */
+		ok1(ntdb_set_attribute(ntdb, &seed_attr) == NTDB_ERR_EINVAL);
+		ok1(tap_log_messages == 1);
+		ok1(ntdb_set_attribute(ntdb, &hash_attr) == NTDB_ERR_EINVAL);
+		ok1(tap_log_messages == 2);
+		tap_log_messages = 0;
+
+		/* Getting them should work as expected. */
+		attr.base.attr = NTDB_ATTRIBUTE_LOG;
+		ok1(ntdb_get_attribute(ntdb, &attr) == 0);
+		ok1(attr.base.attr == NTDB_ATTRIBUTE_LOG);
+		ok1(attr.log.fn == tap_log_attr.log.fn);
+		ok1(attr.log.data == tap_log_attr.log.data);
+
+		attr.base.attr = NTDB_ATTRIBUTE_FLOCK;
+		ok1(ntdb_get_attribute(ntdb, &attr) == 0);
+		ok1(attr.base.attr == NTDB_ATTRIBUTE_FLOCK);
+		ok1(attr.flock.lock == mylock);
+		ok1(attr.flock.unlock == myunlock);
+		ok1(attr.flock.data == &lock_attr);
+
+		/* Unset them again. */
+		ntdb_unset_attribute(ntdb, NTDB_ATTRIBUTE_FLOCK);
+		ok1(tap_log_messages == 0);
+		ntdb_unset_attribute(ntdb, NTDB_ATTRIBUTE_LOG);
+		ok1(tap_log_messages == 0);
+
+		ntdb_close(ntdb);
+		ok1(tap_log_messages == 0);
+
+		/* Now open with all attributes. */
+		ntdb = ntdb_open("run-90-get-set-attributes.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600,
+			       &seed_attr);
+		ok1(ntdb);
+		if (!ntdb)	/* same guard as for the first open */
+			continue;
+		/* Get will succeed */
+		attr.base.attr = NTDB_ATTRIBUTE_LOG;
+		ok1(ntdb_get_attribute(ntdb, &attr) == 0);
+		ok1(attr.base.attr == NTDB_ATTRIBUTE_LOG);
+		ok1(attr.log.fn == tap_log_attr.log.fn);
+		ok1(attr.log.data == tap_log_attr.log.data);
+
+		attr.base.attr = NTDB_ATTRIBUTE_HASH;
+		ok1(ntdb_get_attribute(ntdb, &attr) == 0);
+		ok1(attr.base.attr == NTDB_ATTRIBUTE_HASH);
+		ok1(attr.hash.fn == hash_fn);
+		ok1(attr.hash.data == &hash_attr);
+
+		attr.base.attr = NTDB_ATTRIBUTE_FLOCK;
+		ok1(ntdb_get_attribute(ntdb, &attr) == 0);
+		ok1(attr.base.attr == NTDB_ATTRIBUTE_FLOCK);
+		ok1(attr.flock.lock == mylock);
+		ok1(attr.flock.unlock == myunlock);
+		ok1(attr.flock.data == &lock_attr);
+
+		attr.base.attr = NTDB_ATTRIBUTE_SEED;
+		ok1(ntdb_get_attribute(ntdb, &attr) == 0);
+		ok1(attr.base.attr == NTDB_ATTRIBUTE_SEED);
+		ok1(attr.seed.seed == seed_attr.seed.seed);
+
+		/* Unset attributes. */
+		ntdb_unset_attribute(ntdb, NTDB_ATTRIBUTE_HASH);
+		ok1(tap_log_messages == 1);
+		ntdb_unset_attribute(ntdb, NTDB_ATTRIBUTE_SEED);
+		ok1(tap_log_messages == 2);
+		ntdb_unset_attribute(ntdb, NTDB_ATTRIBUTE_FLOCK);
+		ntdb_unset_attribute(ntdb, NTDB_ATTRIBUTE_LOG);
+		ok1(tap_log_messages == 2);
+		tap_log_messages = 0;
+
+		ntdb_close(ntdb);
+	}
+	return exit_status();
+}
diff --git a/lib/ntdb/test/run-capabilities.c b/lib/ntdb/test/run-capabilities.c
new file mode 100644
index 0000000000..c2c6aa15db
--- /dev/null
+++ b/lib/ntdb/test/run-capabilities.c
@@ -0,0 +1,271 @@
+#include <ccan/failtest/failtest_override.h>
+#include "ntdb-source.h"
+#include "tap-interface.h"
+#include "logging.h"
+#include "layout.h"
+#include "failtest_helper.h"
+#include <stdarg.h>
+
+/*
+ * Length handed to ntdb_layout_add_capability() for a capability: each
+ * "breaks" flag adds its own amount (8, 16, 32), so every combination of
+ * flags yields a different total.
+ */
+static size_t len_of(bool breaks_check, bool breaks_write, bool breaks_open)
+{
+	return (breaks_check ? 8 : 0)
+		+ (breaks_write ? 16 : 0)
+		+ (breaks_open ? 32 : 0);
+}
+
+/*
+ * Write a layout-built NTDB to 'name'.  The fixed arguments describe the
+ * first capability; further (cap, check, write, open) groups may follow in
+ * the variable arguments, ended by a cap of 0.
+ */
+static void create_ntdb(const char *name,
+		       unsigned int cap,
+		       bool breaks_check,
+		       bool breaks_write,
+		       bool breaks_open, ...)
+{
+	NTDB_DATA key = ntdb_mkdata("Hello", 5);
+	NTDB_DATA data = ntdb_mkdata("world", 5);
+	struct ntdb_layout *layout = new_ntdb_layout();
+	struct ntdb_context *ntdb;
+	va_list more;
+	int fd;
+
+	ntdb_layout_add_freetable(layout);
+	/* One used record, one free record and one capability per group.
+	 * The key shrinks by a byte each round, while the used record's
+	 * last argument, 11 - key.dsize, starts at 6 and grows by one. */
+	va_start(more, breaks_open);
+	for (;;) {
+		ntdb_layout_add_used(layout, key, data, 11 - key.dsize);
+		ntdb_layout_add_free(layout, 80, 0);
+		ntdb_layout_add_capability(layout, cap,
+					  breaks_write, breaks_check,
+					  breaks_open,
+					  len_of(breaks_check, breaks_write,
+						 breaks_open));
+		/* The flags are fetched as int, the type bool promotes to. */
+		cap = va_arg(more, int);
+		if (cap == 0)
+			break;
+		breaks_check = va_arg(more, int);
+		breaks_write = va_arg(more, int);
+		breaks_open = va_arg(more, int);
+		key.dsize--;
+	}
+	va_end(more);
+
+	/* We open-code this, because we need to use the failtest write. */
+	ntdb = ntdb_layout_get(layout, failtest_free, &tap_log_attr);
+
+	fd = open(name, O_RDWR|O_TRUNC|O_CREAT, 0600);
+	if (fd < 0)
+		err(1, "opening %s for writing", name);
+	if (write(fd, ntdb->file->map_ptr, ntdb->file->map_size)
+	    != ntdb->file->map_size)
+		err(1, "writing %s", name);
+	close(fd);
+	ntdb_close(ntdb);
+	ntdb_layout_free(layout);
+}
+
+/* Note all the "goto out" early exits: they're to shorten failtest time. */
+int main(int argc, char *argv[])
+{
+	struct ntdb_context *ntdb;
+	char *summary;
+	/* Build dbs carrying capability records, then open/check/summarize. */
+	failtest_init(argc, argv);
+	failtest_hook = block_repeat_failures;
+	failtest_exit_check = exit_check_log;
+	plan_tests(60);
+
+	failtest_suppress = true;
+	/* Capability says you can ignore it? */
+	create_ntdb("run-capabilities.ntdb", 1, false, false, false, 0);
+
+	failtest_suppress = false;
+	ntdb = ntdb_open("run-capabilities.ntdb", NTDB_DEFAULT, O_RDWR, 0,
+		       &tap_log_attr);
+	failtest_suppress = true;
+	if (!ok1(ntdb))
+		goto out;
+	ok1(tap_log_messages == 0);
+	ok1(ntdb_check(ntdb, NULL, NULL) == NTDB_SUCCESS);
+	ok1(tap_log_messages == 0);
+	ntdb_close(ntdb);
+
+	/* Two capabilities say you can ignore them? */
+	create_ntdb("run-capabilities.ntdb",
+		   1, false, false, false,
+		   2, false, false, false, 0);
+
+	failtest_suppress = false;
+	ntdb = ntdb_open("run-capabilities.ntdb", NTDB_DEFAULT, O_RDWR, 0,
+		       &tap_log_attr);
+	failtest_suppress = true;
+	if (!ok1(ntdb))
+		goto out;
+	ok1(tap_log_messages == 0);
+	ok1(ntdb_check(ntdb, NULL, NULL) == NTDB_SUCCESS);
+	ok1(tap_log_messages == 0);
+	ok1(ntdb_summary(ntdb, 0, &summary) == NTDB_SUCCESS);
+	ok1(strstr(summary, "Capability 1\n"));
+	free(summary);
+	ntdb_close(ntdb);
+
+	/* Capability says you can't check. */
+	create_ntdb("run-capabilities.ntdb",
+		   1, false, false, false,
+		   2, true, false, false, 0);
+
+	failtest_suppress = false;
+	ntdb = ntdb_open("run-capabilities.ntdb", NTDB_DEFAULT, O_RDWR, 0,
+		       &tap_log_attr);
+	failtest_suppress = true;
+	if (!ok1(ntdb))
+		goto out;
+	ok1(tap_log_messages == 0);
+	ok1(ntdb_get_flags(ntdb) & NTDB_CANT_CHECK);
+	ok1(ntdb_check(ntdb, NULL, NULL) == NTDB_SUCCESS);
+	/* We expect a warning! */
+	ok1(tap_log_messages == 1);
+	ok1(strstr(log_last, "capabilit"));
+	ok1(ntdb_summary(ntdb, 0, &summary) == NTDB_SUCCESS);
+	ok1(strstr(summary, "Capability 1\n"));
+	ok1(strstr(summary, "Capability 2 (uncheckable)\n"));
+	free(summary);
+	ntdb_close(ntdb);
+	/* tap_log_messages is not reset below, so the expected counts accumulate. */
+	/* Capability says you can't write. */
+	create_ntdb("run-capabilities.ntdb",
+		   1, false, false, false,
+		   2, false, true, false, 0);
+
+	failtest_suppress = false;
+	ntdb = ntdb_open("run-capabilities.ntdb", NTDB_DEFAULT, O_RDWR, 0,
+		       &tap_log_attr);
+	failtest_suppress = true;
+	/* We expect a message. */
+	ok1(!ntdb);
+	if (!ok1(tap_log_messages == 2))
+		goto out;
+	if (!ok1(strstr(log_last, "unknown")))
+		goto out;
+	ok1(strstr(log_last, "write"));
+
+	/* We can open it read-only though! */
+	failtest_suppress = false;
+	ntdb = ntdb_open("run-capabilities.ntdb", NTDB_DEFAULT, O_RDONLY, 0,
+		       &tap_log_attr);
+	failtest_suppress = true;
+	if (!ok1(ntdb))
+		goto out;
+	ok1(tap_log_messages == 2);
+	ok1(ntdb_check(ntdb, NULL, NULL) == NTDB_SUCCESS);
+	ok1(tap_log_messages == 2);
+	ok1(ntdb_summary(ntdb, 0, &summary) == NTDB_SUCCESS);
+	ok1(strstr(summary, "Capability 1\n"));
+	ok1(strstr(summary, "Capability 2 (read-only)\n"));
+	free(summary);
+	ntdb_close(ntdb);
+
+	/* Capability says you can't open. */
+	create_ntdb("run-capabilities.ntdb",
+		   1, false, false, false,
+		   2, false, false, true, 0);
+
+	failtest_suppress = false;
+	ntdb = ntdb_open("run-capabilities.ntdb", NTDB_DEFAULT, O_RDWR, 0,
+		       &tap_log_attr);
+	failtest_suppress = true;
+	/* We expect a message. */
+	ok1(!ntdb);
+	if (!ok1(tap_log_messages == 3))
+		goto out;
+	if (!ok1(strstr(log_last, "unknown")))
+		goto out;
+
+	/* Combine capabilities correctly. */
+	create_ntdb("run-capabilities.ntdb",
+		   1, false, false, false,
+		   2, true, false, false,
+		   3, false, true, false, 0);
+
+	failtest_suppress = false;
+	ntdb = ntdb_open("run-capabilities.ntdb", NTDB_DEFAULT, O_RDWR, 0,
+		       &tap_log_attr);
+	failtest_suppress = true;
+	/* We expect a message. */
+	ok1(!ntdb);
+	if (!ok1(tap_log_messages == 4))
+		goto out;
+	if (!ok1(strstr(log_last, "unknown")))
+		goto out;
+	ok1(strstr(log_last, "write"));
+
+	/* We can open it read-only though! */
+	failtest_suppress = false;
+	ntdb = ntdb_open("run-capabilities.ntdb", NTDB_DEFAULT, O_RDONLY, 0,
+		       &tap_log_attr);
+	failtest_suppress = true;
+	if (!ok1(ntdb))
+		goto out;
+	ok1(tap_log_messages == 4);
+	ok1(ntdb_get_flags(ntdb) & NTDB_CANT_CHECK);
+	ok1(ntdb_check(ntdb, NULL, NULL) == NTDB_SUCCESS);
+	/* We expect a warning! */
+	ok1(tap_log_messages == 5);
+	ok1(strstr(log_last, "unknown"));
+	ok1(ntdb_summary(ntdb, 0, &summary) == NTDB_SUCCESS);
+	ok1(strstr(summary, "Capability 1\n"));
+	ok1(strstr(summary, "Capability 2 (uncheckable)\n"));
+	ok1(strstr(summary, "Capability 3 (read-only)\n"));
+	free(summary);
+	ntdb_close(ntdb);
+
+	/* Two capability flags in one. */
+	create_ntdb("run-capabilities.ntdb",
+		   1, false, false, false,
+		   2, true, true, false,
+		   0);
+
+	failtest_suppress = false;
+	ntdb = ntdb_open("run-capabilities.ntdb", NTDB_DEFAULT, O_RDWR, 0,
+		       &tap_log_attr);
+	failtest_suppress = true;
+	/* We expect a message. */
+	ok1(!ntdb);
+	if (!ok1(tap_log_messages == 6))
+		goto out;
+	if (!ok1(strstr(log_last, "unknown")))
+		goto out;
+	ok1(strstr(log_last, "write"));
+
+	/* We can open it read-only though! */
+	failtest_suppress = false;
+	ntdb = ntdb_open("run-capabilities.ntdb", NTDB_DEFAULT, O_RDONLY, 0,
+		       &tap_log_attr);
+	failtest_suppress = true;
+	if (!ok1(ntdb))
+		goto out;
+	ok1(tap_log_messages == 6);
+	ok1(ntdb_get_flags(ntdb) & NTDB_CANT_CHECK);
+	ok1(ntdb_check(ntdb, NULL, NULL) == NTDB_SUCCESS);
+	/* We expect a warning! */
+	ok1(tap_log_messages == 7);
+	ok1(strstr(log_last, "unknown"));
+	ok1(ntdb_summary(ntdb, 0, &summary) == NTDB_SUCCESS);
+	ok1(strstr(summary, "Capability 1\n"));
+	ok1(strstr(summary, "Capability 2 (uncheckable,read-only)\n"));
+	free(summary);
+	ntdb_close(ntdb);
+
+out:
+	failtest_exit(exit_status());
+}
diff --git a/lib/ntdb/test/run-expand-in-transaction.c b/lib/ntdb/test/run-expand-in-transaction.c
new file mode 100644
index 0000000000..dadbec7922
--- /dev/null
+++ b/lib/ntdb/test/run-expand-in-transaction.c
@@ -0,0 +1,36 @@
+#include "ntdb-source.h"
+#include "tap-interface.h"
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct ntdb_context *ntdb;
+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
+	NTDB_DATA key = ntdb_mkdata("key", 3);
+	NTDB_DATA data = ntdb_mkdata("data", 4);
+	/* Seven checks per flag combination, plus the final log-message check. */
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 7 + 1);
+
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		size_t size;
+		ntdb = ntdb_open("run-expand-in-transaction.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(ntdb);
+		if (!ntdb)
+			continue;
+
+		size = ntdb->file->map_size;	/* size before the transaction */
+		ok1(ntdb_transaction_start(ntdb) == 0);
+		ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
+		ok1(ntdb->file->map_size > size);	/* grew inside the transaction */
+		ok1(ntdb_transaction_commit(ntdb) == 0);
+		ok1(ntdb->file->map_size > size);	/* and is still larger after commit */
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+		ntdb_close(ntdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/run-features.c b/lib/ntdb/test/run-features.c
new file mode 100644
index 0000000000..0d6b3bce76
--- /dev/null
+++ b/lib/ntdb/test/run-features.c
@@ -0,0 +1,62 @@
+#include "ntdb-source.h"
+#include "tap-interface.h"
+#include "logging.h"
+/* Scribble on the header's feature words, reopen, and check what open did. */
+int main(int argc, char *argv[])
+{
+	unsigned int i, j;
+	struct ntdb_context *ntdb;
+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
+	NTDB_DATA key = { (unsigned char *)&j, sizeof(j) };	/* aliases j */
+	NTDB_DATA data = { (unsigned char *)&j, sizeof(j) };	/* aliases j */
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 8 + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		uint64_t features;
+		ntdb = ntdb_open("run-features.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(ntdb);
+		if (!ntdb)
+			continue;
+
+		/* Put some stuff in there: 100 records with key == data == j. */
+		for (j = 0; j < 100; j++) {
+			if (ntdb_store(ntdb, key, data, NTDB_REPLACE) != 0)
+				fail("Storing in ntdb");
+		}
+
+		/* Mess with features fields in hdr: every bit outside
+		 * NTDB_FEATURE_MASK set, and bit 0 flipped. */
+		features = (~NTDB_FEATURE_MASK ^ 1);
+		ok1(ntdb_write_convert(ntdb, offsetof(struct ntdb_header,
+						    features_used),
+				      &features, sizeof(features)) == 0);
+		ok1(ntdb_write_convert(ntdb, offsetof(struct ntdb_header,
+						    features_offered),
+				      &features, sizeof(features)) == 0);
+		ntdb_close(ntdb);
+
+		/* Reopen without O_CREAT/O_TRUNC so the modified header is read. */
+		ntdb = ntdb_open("run-features.ntdb", flags[i], O_RDWR, 0,
+			       &tap_log_attr);
+		ok1(ntdb);
+		if (!ntdb)
+			continue;
+		/* Should not have changed features offered. */
+		ok1(ntdb_read_convert(ntdb, offsetof(struct ntdb_header,
+						   features_offered),
+				     &features, sizeof(features)) == 0);
+		ok1(features == (~NTDB_FEATURE_MASK ^ 1));
+
+		/* Should have cleared unknown bits in features_used. */
+		ok1(ntdb_read_convert(ntdb, offsetof(struct ntdb_header,
+						   features_used),
+				     &features, sizeof(features)) == 0);
+		ok1(features == (1 & NTDB_FEATURE_MASK));
+
+		ntdb_close(ntdb);
+	}
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/run-lockall.c b/lib/ntdb/test/run-lockall.c
new file mode 100644
index 0000000000..964164e20b
--- /dev/null
+++ b/lib/ntdb/test/run-lockall.c
@@ -0,0 +1,70 @@
+#include "private.h"
+#include <unistd.h>
+#include "lock-tracking.h"
+
+#define fcntl fcntl_with_lockcheck
+#include "ntdb-source.h"
+
+#include "tap-interface.h"
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdarg.h>
+#include "external-agent.h"
+#include "logging.h"
+
+#define TEST_DBNAME "run-lockall.ntdb"
+
+#undef fcntl
+
+/* Check ntdb_lockall() and ntdb_lockall_read() against an external agent. */
+int main(int argc, char *argv[])
+{
+	struct agent *agent;
+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
+	unsigned int i;	/* unsigned: it is compared against sizeof() below */
+
+	plan_tests(13 * sizeof(flags)/sizeof(flags[0]) + 1);
+	agent = prepare_external_agent();
+	if (!agent)
+		err(1, "preparing agent");
+
+	for (i = 0; i < sizeof(flags)/sizeof(flags[0]); i++) {
+		enum agent_return ret;
+		struct ntdb_context *ntdb;
+
+		ntdb = ntdb_open(TEST_DBNAME, flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (!ok1(ntdb))	/* the lock calls below need a real handle */
+			continue;
+		ret = external_agent_operation(agent, OPEN, TEST_DBNAME);
+		ok1(ret == SUCCESS);
+
+		/* While we hold the write lock the agent can neither store nor fetch. */
+		ok1(ntdb_lockall(ntdb) == NTDB_SUCCESS);
+		ok1(external_agent_operation(agent, STORE, "key")
+		    == WOULD_HAVE_BLOCKED);
+		ok1(external_agent_operation(agent, FETCH, "key")
+		    == WOULD_HAVE_BLOCKED);
+		/* Test nesting. */
+		ok1(ntdb_lockall(ntdb) == NTDB_SUCCESS);
+		ntdb_unlockall(ntdb);
+		ntdb_unlockall(ntdb);
+
+		ok1(external_agent_operation(agent, STORE, "key") == SUCCESS);
+
+		/* Under the read lock the agent may fetch but not store. */
+		ok1(ntdb_lockall_read(ntdb) == NTDB_SUCCESS);
+		ok1(external_agent_operation(agent, STORE, "key")
+		    == WOULD_HAVE_BLOCKED);
+		ok1(external_agent_operation(agent, FETCH, "key") == SUCCESS);
+		ok1(ntdb_lockall_read(ntdb) == NTDB_SUCCESS);
+		ntdb_unlockall_read(ntdb);
+		ntdb_unlockall_read(ntdb);
+
+		ok1(external_agent_operation(agent, STORE, "key") == SUCCESS);
+		ok1(external_agent_operation(agent, CLOSE, NULL) == SUCCESS);
+		ntdb_close(ntdb);
+	}
+	free_external_agent(agent);
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/run-remap-in-read_traverse.c b/lib/ntdb/test/run-remap-in-read_traverse.c
new file mode 100644
index 0000000000..2d817c2d73
--- /dev/null
+++ b/lib/ntdb/test/run-remap-in-read_traverse.c
@@ -0,0 +1,57 @@
+#include "ntdb-source.h"
+/* We had a bug where we marked the ntdb read-only for a ntdb_traverse_read.
+ * If we then expanded the ntdb, we would remap read-only, and later SEGV. */
+#include "tap-interface.h"
+#include "external-agent.h"
+#include "logging.h"
+
+/* True once fd is bigger than size; also true if fstat() fails, so callers stop. */
+static bool file_larger(int fd, ntdb_len_t size)
+{
+	struct stat st;
+
+	return fstat(fd, &st) != 0 || (ntdb_len_t)st.st_size > size;
+}
+
+static unsigned add_records_to_grow(struct agent *agent, int fd, ntdb_len_t size)
+{
+	unsigned int i;
+
+	for (i = 0; !file_larger(fd, size); i++) {
+		char data[20];
+		sprintf(data, "%i", i);
+		if (external_agent_operation(agent, STORE, data) != SUCCESS)
+			return 0;
+	}
+	diag("Added %u records to grow file", i);
+	return i;
+}
+
+/* Let the agent grow the file, then traverse and store from this process. */
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct agent *agent;
+	struct ntdb_context *ntdb;
+	NTDB_DATA d = ntdb_mkdata("hello", 5);
+	const char filename[] = "run-remap-in-read_traverse.ntdb";
+
+	plan_tests(4);
+	agent = prepare_external_agent();
+	if (!agent)
+		err(1, "preparing agent");
+	ntdb = ntdb_open(filename, NTDB_DEFAULT,
+		       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+	if (!ntdb)	/* the code below dereferences ntdb->file */
+		err(1, "opening %s", filename);
+	ok1(external_agent_operation(agent, OPEN, filename) == SUCCESS);
+	i = add_records_to_grow(agent, ntdb->file->fd, ntdb->file->map_size);
+	/* Do a traverse: it must report exactly the records the agent added. */
+	ok1(ntdb_traverse(ntdb, NULL, NULL) == i);
+	/* Now store something! */
+	ok1(ntdb_store(ntdb, d, d, NTDB_INSERT) == 0);
+	ok1(tap_log_messages == 0);
+	ntdb_close(ntdb);
+	free_external_agent(agent);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/run-seed.c b/lib/ntdb/test/run-seed.c
new file mode 100644
index 0000000000..2514f728ac
--- /dev/null
+++ b/lib/ntdb/test/run-seed.c
@@ -0,0 +1,61 @@
+#include "ntdb-source.h"
+#include "tap-interface.h"
+#include "logging.h"
+
+static int log_count = 0;
+
+/* Normally we get a log when setting random seed; this test supplies its
+ * own seed, so every call that does arrive here is just counted. */
+static void my_log_fn(struct ntdb_context *ntdb, enum ntdb_log_level level,
+		      enum NTDB_ERROR ecode, const char *message,
+		      void *priv)
+{
+	log_count += 1;
+}
+
+static union ntdb_attribute log_attr = {	/* log attribute delivering to my_log_fn */
+	.log = { .base = { .attr = NTDB_ATTRIBUTE_LOG },
+		 .fn = my_log_fn }
+};
+/* Open with an explicit hash seed; check it is used, not logged, and on disk. */
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct ntdb_context *ntdb;
+	union ntdb_attribute attr;
+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
+			NTDB_NOMMAP|NTDB_CONVERT };
+
+	attr.seed.base.attr = NTDB_ATTRIBUTE_SEED;
+	attr.seed.base.next = &log_attr;	/* chain: seed attribute, then the log counter */
+	attr.seed.seed = 42;
+
+	/* 4 checks per open, plus 3 file checks for each of the 4 non-INTERNAL flag sets. */
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 4 + 4 * 3);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		struct ntdb_header hdr;
+		int fd;
+		ntdb = ntdb_open("run-seed.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &attr);
+		ok1(ntdb);
+		if (!ntdb)
+			continue;
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+		ok1(ntdb->hash_seed == 42);
+		ok1(log_count == 0);
+		ntdb_close(ntdb);
+
+		if (flags[i] & NTDB_INTERNAL)
+			continue;	/* the raw-file checks are skipped for NTDB_INTERNAL */
+
+		fd = open("run-seed.ntdb", O_RDONLY);
+		ok1(fd >= 0);
+		ok1(read(fd, &hdr, sizeof(hdr)) == sizeof(hdr));
+		if (flags[i] & NTDB_CONVERT)	/* expect the seed byte-swapped in the raw header */
+			ok1(bswap_64(hdr.hash_seed) == 42);
+		else
+			ok1(hdr.hash_seed == 42);
+		close(fd);
+	}
+	return exit_status();
+}
diff --git a/lib/ntdb/test/run-tdb_errorstr.c b/lib/ntdb/test/run-tdb_errorstr.c
new file mode 100644
index 0000000000..5b023140a7
--- /dev/null
+++ b/lib/ntdb/test/run-tdb_errorstr.c
@@ -0,0 +1,52 @@
+#include "ntdb-source.h"
+#include "tap-interface.h"
+
+/*
+ * Check the text ntdb_errorstr() returns for every known error code, and
+ * for one value just past the last known code.
+ */
+int main(int argc, char *argv[])
+{
+	enum NTDB_ERROR e;
+
+	/* One check per code from NTDB_SUCCESS down to NTDB_ERR_RDONLY,
+	 * plus the final out-of-range check. */
+	plan_tests(NTDB_ERR_RDONLY*-1 + 2);
+
+	for (e = NTDB_SUCCESS; e >= NTDB_ERR_RDONLY; e--) {
+		switch (e) {
+		case NTDB_SUCCESS:
+			ok1(!strcmp(ntdb_errorstr(e), "Success"));
+			break;
+		case NTDB_ERR_IO:
+			ok1(!strcmp(ntdb_errorstr(e), "IO Error"));
+			break;
+		case NTDB_ERR_LOCK:
+			ok1(!strcmp(ntdb_errorstr(e), "Locking error"));
+			break;
+		case NTDB_ERR_OOM:
+			ok1(!strcmp(ntdb_errorstr(e), "Out of memory"));
+			break;
+		case NTDB_ERR_EXISTS:
+			ok1(!strcmp(ntdb_errorstr(e), "Record exists"));
+			break;
+		case NTDB_ERR_EINVAL:
+			ok1(!strcmp(ntdb_errorstr(e), "Invalid parameter"));
+			break;
+		case NTDB_ERR_NOEXIST:
+			ok1(!strcmp(ntdb_errorstr(e), "Record does not exist"));
+			break;
+		case NTDB_ERR_RDONLY:
+			ok1(!strcmp(ntdb_errorstr(e), "write not permitted"));
+			break;
+		case NTDB_ERR_CORRUPT:
+			ok1(!strcmp(ntdb_errorstr(e), "Corrupt database"));
+			break;
+		}
+	}
+
+	/* The loop leaves e one below NTDB_ERR_RDONLY: not a known code. */
+	ok1(!strcmp(ntdb_errorstr(e), "Invalid error code"));
+
+	return exit_status();
+}
diff --git a/lib/ntdb/test/run-tdb_foreach.c b/lib/ntdb/test/run-tdb_foreach.c
new file mode 100644
index 0000000000..f1a2d00919
--- /dev/null
+++ b/lib/ntdb/test/run-tdb_foreach.c
@@ -0,0 +1,86 @@
+#include "ntdb-source.h"
+#include "tap-interface.h"
+#include "logging.h"
+
+/* ntdb_foreach callback: decrement *count, returning 1 when it reaches zero. */
+static int drop_count(struct ntdb_context *ntdb, unsigned int *count)
+{
+	*count -= 1;
+	return *count == 0;
+}
+
+/* ntdb_foreach callback: tick off which of the three test databases we were
+ * handed; abort() on an unknown name or on a second visit. */
+static int set_found(struct ntdb_context *ntdb, bool found[3])
+{
+	static const char *const names[3] = {
+		"run-ntdb_foreach0.ntdb",
+		"run-ntdb_foreach1.ntdb",
+		"run-ntdb_foreach2.ntdb",
+	};
+	unsigned int idx;
+	for (idx = 0; idx < 3; idx++)
+		if (strcmp(ntdb_name(ntdb), names[idx]) == 0)
+			break;
+	if (idx == 3 || found[idx])
+		abort();
+	found[idx] = true;
+	return 0;
+}
+/* Check that ntdb_foreach() visits exactly the databases currently open. */
+int main(int argc, char *argv[])
+{
+	unsigned int i, count;
+	bool found[3];	/* found[n] is set by set_found() for run-ntdb_foreach<n>.ntdb */
+	struct ntdb_context *ntdb0, *ntdb1, *ntdb;
+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 8);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		ntdb0 = ntdb_open("run-ntdb_foreach0.ntdb", flags[i],
+				O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ntdb1 = ntdb_open("run-ntdb_foreach1.ntdb", flags[i],
+				O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ntdb = ntdb_open("run-ntdb_foreach2.ntdb", flags[i],
+				O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+
+		memset(found, 0, sizeof(found));
+		ntdb_foreach(set_found, found);
+		ok1(found[0] && found[1] && found[2]);
+
+		/* Test premature iteration termination: drop_count() returns
+		 * non-zero on its first call, leaving count at 0. */
+		count = 1;
+		ntdb_foreach(drop_count, &count);
+		ok1(count == 0);
+		ntdb_close(ntdb1);	/* from here on, closed databases must not be visited */
+		memset(found, 0, sizeof(found));
+		ntdb_foreach(set_found, found);
+		ok1(found[0] && !found[1] && found[2]);
+
+		ntdb_close(ntdb);
+		memset(found, 0, sizeof(found));
+		ntdb_foreach(set_found, found);
+		ok1(found[0] && !found[1] && !found[2]);
+
+		ntdb1 = ntdb_open("run-ntdb_foreach1.ntdb", flags[i],
+				O_RDWR, 0600, &tap_log_attr);	/* reopened: must be visited again */
+		memset(found, 0, sizeof(found));
+		ntdb_foreach(set_found, found);
+		ok1(found[0] && found[1] && !found[2]);
+
+		ntdb_close(ntdb0);
+		memset(found, 0, sizeof(found));
+		ntdb_foreach(set_found, found);
+		ok1(!found[0] && found[1] && !found[2]);
+
+		ntdb_close(ntdb1);
+		memset(found, 0, sizeof(found));
+		ntdb_foreach(set_found, found);
+		ok1(!found[0] && !found[1] && !found[2]);
+		ok1(tap_log_messages == 0);
+	}
+
+	return exit_status();
+}
diff --git a/lib/ntdb/test/run-traverse.c b/lib/ntdb/test/run-traverse.c
new file mode 100644
index 0000000000..9dfc94d3b3
--- /dev/null
+++ b/lib/ntdb/test/run-traverse.c
@@ -0,0 +1,203 @@
+#include "ntdb-source.h"
+#include "tap-interface.h"
+#include "logging.h"
+
+#define NUM_RECORDS 1000
+
+/* Ignore the seed argument; hash with the one at *p, which we saw a failure on. */
+static uint64_t fixedhash(const void *key, size_t len, uint64_t seed, void *p)
+{
+	const uint64_t *fixed_seed = p;
+	return hash64_stable((const unsigned char *)key, len, *fixed_seed);
+}
+
+/* Store NUM_RECORDS records whose key and data are both the int index. */
+static bool store_records(struct ntdb_context *ntdb)
+{
+	int n;
+	NTDB_DATA key = { (unsigned char *)&n, sizeof(n) };
+	NTDB_DATA data = key;	/* same pointer and size as key */
+	for (n = 0; n < NUM_RECORDS; n++)
+		if (ntdb_store(ntdb, key, data, NTDB_REPLACE) != NTDB_SUCCESS)
+			return false;
+	return true;
+}
+
+struct trav_data {	/* state shared between main() and the trav() callback */
+	unsigned int calls, call_limit;	/* calls so far; trav() returns 1 once they are equal */
+	int low, high;	/* smallest / largest record value seen */
+	bool mismatch;	/* set when a record is not an int-sized key equal to its data */
+	bool delete;	/* when true, trav() deletes each record it visits */
+	enum NTDB_ERROR delete_error;	/* result of the most recent ntdb_delete() */
+};
+
+/* Traverse callback: validate the record, track min/max, optionally delete
+ * it, and return 1 once call_limit records have been seen. */
+static int trav(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA dbuf,
+		struct trav_data *td)
+{
+	int val;
+	bool well_formed = key.dsize == sizeof(val)
+		&& dbuf.dsize == sizeof(val)
+		&& memcmp(key.dptr, dbuf.dptr, key.dsize) == 0;
+
+	td->calls++;
+	if (!well_formed) {
+		td->mismatch = true;
+		return -1;
+	}
+
+	memcpy(&val, dbuf.dptr, dbuf.dsize);
+	td->low = val < td->low ? val : td->low;
+	td->high = val > td->high ? val : td->high;
+
+	if (td->delete) {
+		td->delete_error = ntdb_delete(ntdb, key);
+		if (td->delete_error != NTDB_SUCCESS)
+			return -1;
+	}
+
+	return td->calls == td->call_limit ? 1 : 0;
+}
+
+struct trav_grow_data {	/* state shared between main() and trav_grow() */
+	unsigned int calls;	/* number of callback invocations */
+	unsigned int num_large;	/* records seen whose data was already longer than an int */
+	bool mismatch;	/* set when a record fails the key/data sanity check */
+	enum NTDB_ERROR error;	/* result of the most recent ntdb_append() */
+};
+
+/* Traverse callback: append 128 zero bytes to each record visited, counting
+ * in num_large the records whose data is already bigger than an int. */
+static int trav_grow(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA dbuf,
+		     struct trav_grow_data *tgd)
+{
+	int val;
+	unsigned char padding[128] = { 0 };
+	NTDB_DATA extra = { padding, sizeof(padding) };
+	bool well_formed = key.dsize == sizeof(val)
+		&& dbuf.dsize >= sizeof(val)
+		&& memcmp(key.dptr, dbuf.dptr, key.dsize) == 0;
+
+	tgd->calls++;
+	if (!well_formed) {
+		tgd->mismatch = true;
+		return -1;
+	}
+
+	/* We must have seen this before! */
+	if (dbuf.dsize > sizeof(val))
+		tgd->num_large++;
+
+	/* Make a big difference to the database. */
+	tgd->error = ntdb_append(ntdb, key, extra);
+	return tgd->error == NTDB_SUCCESS ? 0 : -1;
+}
+/* Exercise ntdb_traverse(): full, early-exit, deleting and growing callbacks. */
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	int num;
+	struct trav_data td;
+	struct trav_grow_data tgd;
+	struct ntdb_context *ntdb;
+	uint64_t seed = 16014841315512641303ULL;	/* handed to fixedhash() via hattr */
+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
+			NTDB_NOMMAP|NTDB_CONVERT };
+	union ntdb_attribute hattr = { .hash = { .base = { NTDB_ATTRIBUTE_HASH },
+						.fn = fixedhash,
+						.data = &seed } };
+
+	hattr.base.next = &tap_log_attr;	/* chain the tap logger after the hash attribute */
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 32 + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		ntdb = ntdb_open("run-traverse.ntdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
+		ok1(ntdb);
+		if (!ntdb)
+			continue;
+
+		/* Nothing stored yet: a NULL-callback traverse reports 0. */
+		ok1(ntdb_traverse(ntdb, NULL, NULL) == 0);
+
+		ok1(store_records(ntdb));
+		num = ntdb_traverse(ntdb, NULL, NULL);
+		ok1(num == NUM_RECORDS);
+
+		/* Full traverse. */
+		td.calls = 0;
+		td.call_limit = UINT_MAX;
+		td.low = INT_MAX;
+		td.high = INT_MIN;
+		td.mismatch = false;
+		td.delete = false;
+
+		num = ntdb_traverse(ntdb, trav, &td);
+		ok1(num == NUM_RECORDS);
+		ok1(!td.mismatch);
+		ok1(td.calls == NUM_RECORDS);
+		ok1(td.low == 0);
+		ok1(td.high == NUM_RECORDS-1);
+
+		/* Short traverse: trav() returns 1 after NUM_RECORDS / 2 calls. */
+		td.calls = 0;
+		td.call_limit = NUM_RECORDS / 2;
+		td.low = INT_MAX;
+		td.high = INT_MIN;
+		td.mismatch = false;
+		td.delete = false;
+
+		num = ntdb_traverse(ntdb, trav, &td);
+		ok1(num == NUM_RECORDS / 2);
+		ok1(!td.mismatch);
+		ok1(td.calls == NUM_RECORDS / 2);
+		ok1(td.low <= NUM_RECORDS / 2);
+		ok1(td.high > NUM_RECORDS / 2);
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+		ok1(tap_log_messages == 0);
+
+		/* Deleting traverse (delete everything). */
+		td.calls = 0;
+		td.call_limit = UINT_MAX;
+		td.low = INT_MAX;
+		td.high = INT_MIN;
+		td.mismatch = false;
+		td.delete = true;
+		td.delete_error = NTDB_SUCCESS;
+		num = ntdb_traverse(ntdb, trav, &td);
+		ok1(num == NUM_RECORDS);
+		ok1(td.delete_error == NTDB_SUCCESS);
+		ok1(!td.mismatch);
+		ok1(td.calls == NUM_RECORDS);
+		ok1(td.low == 0);
+		ok1(td.high == NUM_RECORDS - 1);
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+		/* Now it's empty! */
+		ok1(ntdb_traverse(ntdb, NULL, NULL) == 0);
+
+		/* Re-add. */
+		ok1(store_records(ntdb));
+		ok1(ntdb_traverse(ntdb, NULL, NULL) == NUM_RECORDS);
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+
+		/* Grow.  This will cause us to be reshuffled. */
+		tgd.calls = 0;
+		tgd.num_large = 0;
+		tgd.mismatch = false;
+		tgd.error = NTDB_SUCCESS;
+		ok1(ntdb_traverse(ntdb, trav_grow, &tgd) > 1);
+		ok1(tgd.error == 0);
+		ok1(!tgd.mismatch);
+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
+		ok1(tgd.num_large < tgd.calls);	/* not every visit may be a repeat */
+		diag("growing db: %u calls, %u repeats",
+		     tgd.calls, tgd.num_large);
+
+		ntdb_close(ntdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/ntdb/test/tap-interface.c b/lib/ntdb/test/tap-interface.c
new file mode 100644
index 0000000000..077ec2cd9a
--- /dev/null
+++ b/lib/ntdb/test/tap-interface.c
@@ -0,0 +1,3 @@
+#include "tap-interface.h"
+/* Passing checks so far, and the planned total (-1U until plan_tests() sets it). */
+unsigned tap_ok_count, tap_ok_target = -1U;
diff --git a/lib/ntdb/test/tap-interface.h b/lib/ntdb/test/tap-interface.h
new file mode 100644
index 0000000000..f3d4ec2545
--- /dev/null
+++ b/lib/ntdb/test/tap-interface.h
@@ -0,0 +1,41 @@
+/*
+   Unix SMB/CIFS implementation.
+   Simplistic implementation of tap interface.
+
+   Copyright (C) Rusty Russell 2012
+
+     ** NOTE! The following LGPL license applies to the talloc
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include <stdio.h>
+#include <ccan/err/err.h>
+
+#ifndef __location__
+#define __TAP_STRING_LINE1__(s)    #s
+#define __TAP_STRING_LINE2__(s)   __TAP_STRING_LINE1__(s)
+#define __TAP_STRING_LINE3__  __TAP_STRING_LINE2__(__LINE__)
+#define __location__ __FILE__ ":" __TAP_STRING_LINE3__
+#endif
+/* Simplistic TAP stand-ins: ok() counts passes, exit_status() compares with the plan. */
+extern unsigned tap_ok_count, tap_ok_target;
+#define plan_tests(num) do { tap_ok_target = (num); } while(0)
+#define ok(e, ...) ((e) ? (printf("."), tap_ok_count++, true) : (warnx(__VA_ARGS__), false))
+#define ok1(e) ok((e), "%s:%s", __location__, #e)
+#define pass(...) (printf("."), tap_ok_count++)
+#define fail(...) warnx(__VA_ARGS__)
+#define diag printf
+#define exit_status() (tap_ok_count == tap_ok_target ? 0 : 1)
diff --git a/lib/ntdb/tools/Makefile b/lib/ntdb/tools/Makefile
new file mode 100644
index 0000000000..087c256d7f
--- /dev/null
+++ b/lib/ntdb/tools/Makefile
@@ -0,0 +1,16 @@
+OBJS:=../../ntdb.o ../../hash.o ../../tally.o
+CFLAGS:=-I../../.. -I.. -Wall -g -O3 #-g -pg
+LDFLAGS:=-L../../..
+
+default: ntdbtorture ntdbtool ntdbdump ntdbrestore mkntdb speed growtdb-bench
+
+ntdbdump: ntdbdump.c $(OBJS)
+ntdbrestore: ntdbrestore.c $(OBJS)
+ntdbtorture: ntdbtorture.c $(OBJS)
+ntdbtool: ntdbtool.c $(OBJS)
+mkntdb: mkntdb.c $(OBJS)
+speed: speed.c $(OBJS)
+growtdb-bench: growtdb-bench.c $(OBJS)
+
+clean:
+	rm -f ntdbtorture ntdbdump ntdbrestore ntdbtool mkntdb speed growtdb-bench
diff --git a/lib/ntdb/tools/growtdb-bench.c b/lib/ntdb/tools/growtdb-bench.c
new file mode 100644
index 0000000000..640f87af5a
--- /dev/null
+++ b/lib/ntdb/tools/growtdb-bench.c
@@ -0,0 +1,114 @@
+#include "ntdb.h"
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <ccan/err/err.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+static void logfn(struct ntdb_context *ntdb,
+		  enum ntdb_log_level level,
+		  enum NTDB_ERROR ecode,
+		  const char *message,
+		  void *data)
+{
+	fprintf(stderr, "ntdb:%s:%s:%s\n",
+		ntdb_name(ntdb), ntdb_errorstr(ecode), message);
+}
+
+/* Benchmark: insert <users> records, then do <groups> rounds of appends, running "cat /proc/<pid>/statm" after each. */
+int main(int argc, char *argv[])
+{
+	unsigned int i, j, users, groups;
+	NTDB_DATA idxkey, idxdata;
+	NTDB_DATA k, d, gk;
+	char cmd[100];
+	struct ntdb_context *ntdb;
+	enum NTDB_ERROR ecode;
+	union ntdb_attribute log;
+
+	if (argc != 3) {
+		printf("Usage: growtdb-bench <users> <groups>\n");
+		exit(1);
+	}
+	users = atoi(argv[1]);
+	groups = atoi(argv[2]);
+	snprintf(cmd, sizeof(cmd), "cat /proc/%i/statm", (int)getpid());
+
+	log.base.attr = NTDB_ATTRIBUTE_LOG;
+	log.base.next = NULL;
+	log.log.fn = logfn;
+	ntdb = ntdb_open("/tmp/growtdb.ntdb", NTDB_DEFAULT,
+		       O_RDWR|O_CREAT|O_TRUNC, 0600, &log);
+	if (!ntdb)
+		err(1, "Opening /tmp/growtdb.ntdb");
+
+	idxkey.dptr = (unsigned char *)"User index";
+	idxkey.dsize = strlen("User index");
+	idxdata.dsize = 51;
+	idxdata.dptr = calloc(idxdata.dsize, 1);
+	/* Create users. */
+	k.dsize = 48;
+	k.dptr = calloc(k.dsize, 1);
+	d.dsize = 64;
+	d.dptr = calloc(d.dsize, 1);
+	if (!idxdata.dptr || !k.dptr || !d.dptr)
+		errx(1, "out of memory");
+	if ((ecode = ntdb_transaction_start(ntdb)) != 0)
+		errx(1, "ntdb transaction start failed: %s", ntdb_errorstr(ecode));
+	for (i = 0; i < users; i++) {
+		memcpy(k.dptr, &i, sizeof(i));
+		ecode = ntdb_store(ntdb, k, d, NTDB_INSERT);
+		if (ecode != NTDB_SUCCESS)
+			errx(1, "ntdb insert failed: %s", ntdb_errorstr(ecode));
+		/* This simulates a growing index record. */
+		ecode = ntdb_append(ntdb, idxkey, idxdata);
+		if (ecode != NTDB_SUCCESS)
+			errx(1, "ntdb append failed: %s", ntdb_errorstr(ecode));
+	}
+	if ((ecode = ntdb_transaction_commit(ntdb)) != 0)
+		errx(1, "ntdb commit1 failed: %s", ntdb_errorstr(ecode));
+	if ((ecode = ntdb_check(ntdb, NULL, NULL)) != 0)
+		errx(1, "ntdb_check failed after initial insert!");
+	system(cmd);
+
+	/* Now put them all in groups: add 32 bytes to each record for
+	 * a group. */
+	gk.dsize = 48;
+	gk.dptr = calloc(gk.dsize, 1);
+	if (!gk.dptr)
+		errx(1, "out of memory");
+	gk.dptr[gk.dsize-1] = 1;
+	d.dsize = 32;
+	for (i = 0; i < groups; i++) {
+		if ((ecode = ntdb_transaction_start(ntdb)) != 0)
+			errx(1, "ntdb transaction start failed: %s", ntdb_errorstr(ecode));
+		/* Create the "group". */
+		memcpy(gk.dptr, &i, sizeof(i));
+		ecode = ntdb_store(ntdb, gk, d, NTDB_INSERT);
+		if (ecode != NTDB_SUCCESS)
+			errx(1, "ntdb insert failed: %s", ntdb_errorstr(ecode));
+		/* Now populate it. */
+		for (j = 0; j < users; j++) {
+			/* Append to the user. */
+			memcpy(k.dptr, &j, sizeof(j));
+			if ((ecode = ntdb_append(ntdb, k, d)) != 0)
+				errx(1, "ntdb append failed: %s",
+				     ntdb_errorstr(ecode));
+			/* Append to the group. */
+			if ((ecode = ntdb_append(ntdb, gk, d)) != 0)
+				errx(1, "ntdb append failed: %s",
+				     ntdb_errorstr(ecode));
+		}
+		if ((ecode = ntdb_transaction_commit(ntdb)) != 0)
+			errx(1, "ntdb commit2 failed: %s", ntdb_errorstr(ecode));
+		if ((ecode = ntdb_check(ntdb, NULL, NULL)) != 0)
+			errx(1, "ntdb_check failed after iteration %u!", i);
+		system(cmd);
+	}
+
+	ntdb_close(ntdb);
+	return 0;
+}
diff --git a/lib/ntdb/tools/mkntdb.c b/lib/ntdb/tools/mkntdb.c
new file mode 100644
index 0000000000..e728987a53
--- /dev/null
+++ b/lib/ntdb/tools/mkntdb.c
@@ -0,0 +1,29 @@
+#include "ntdb.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <ccan/err/err.h>
+
+/* Create <ntdbfile> holding <numrecords> records, each keyed and valued by its index. */
+int main(int argc, char *argv[])
+{
+	unsigned int i, num_recs;
+	struct ntdb_context *ntdb;
+
+	if (argc != 3 || (num_recs = atoi(argv[2])) == 0)
+		errx(1, "Usage: mkntdb <ntdbfile> <numrecords>");
+	ntdb = ntdb_open(argv[1], NTDB_DEFAULT, O_CREAT|O_TRUNC|O_RDWR, 0600,NULL);
+	if (!ntdb)
+		err(1, "Opening %s", argv[1]);
+	for (i = 0; i < num_recs; i++) {
+		enum NTDB_ERROR ecode;
+		NTDB_DATA d;
+		d.dptr = (void *)&i;
+		d.dsize = sizeof(i);
+		if ((ecode = ntdb_store(ntdb, d, d, NTDB_INSERT)) != NTDB_SUCCESS)
+			errx(1, "Failed to store record %u: %s", i, ntdb_errorstr(ecode));
+	}
+	ntdb_close(ntdb);
+	printf("Done\n");
+	return 0;
+}
diff --git a/lib/ntdb/tools/ntdbbackup.c b/lib/ntdb/tools/ntdbbackup.c
new file mode 100644
index 0000000000..a76f18491b
--- /dev/null
+++ b/lib/ntdb/tools/ntdbbackup.c
@@ -0,0 +1,340 @@
+/*
+   Unix SMB/CIFS implementation.
+   low level ntdb backup and restore utility
+   Copyright (C) Andrew Tridgell              2002
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+
+  This program is meant for backup/restore of ntdb databases. Typical usage would be:
+     ntdbbackup *.ntdb
+  when Samba shuts down cleanly, which will make a backup of all the local databases
+  to *.bak files. Then on Samba startup you would use:
+     ntdbbackup -v *.ntdb
+  and this will check the databases for corruption and if corruption is detected then
+  the backup will be restored.
+
+  You may also like to do a backup on a regular basis while Samba is
+  running, perhaps using cron.
+
+  The reason this program is needed is to cope with power failures
+  while Samba is running. A power failure could lead to database
+  corruption and Samba will then not start correctly.
+
+  Note that many of the databases in Samba are transient and thus
+  don't need to be backed up, so you can optimise the above a little
+  by only running the backup on the critical databases.
+
+ */
+
+#include "config.h"
+#include "ntdb.h"
+#include "system/filesys.h"
+
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#endif
+
+static int failed;
+
+static void ntdb_log(struct ntdb_context *ntdb,
+		    enum ntdb_log_level level,
+		    enum NTDB_ERROR ecode,
+		    const char *message,
+		    void *data)
+{
+	fprintf(stderr, "%s:%s\n", ntdb_errorstr(ecode), message);
+}
+
+/* Return a malloc'ed "<name><suffix>" string; exits the program if malloc fails. */
+static char *add_suffix(const char *name, const char *suffix)
+{
+	size_t len = strlen(name) + strlen(suffix) + 1;	/* size_t: no narrowing to int */
+	char *ret = malloc(len);
+	if (!ret) {
+		fprintf(stderr,"Out of memory!\n");
+		exit(1);
+	}
+	snprintf(ret, len, "%s%s", name, suffix);
+	return ret;
+}
+
+/* Traverse callback: insert each record into the ntdb passed in as state. */
+static int copy_fn(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA dbuf, void *state)
+{
+	struct ntdb_context *dest = state;
+	enum NTDB_ERROR ecode = ntdb_store(dest, key, dbuf, NTDB_INSERT);
+
+	if (ecode == NTDB_SUCCESS)
+		return 0;
+	/* Record the failure in the file-level flag and return non-zero. */
+	fprintf(stderr,"Failed to insert into %s: %s\n",
+		ntdb_name(dest), ntdb_errorstr(ecode));
+	failed = 1;
+	return 1;
+}
+
+/* Traverse callback that accepts every record; callers only use the traverse count. */
+static int test_fn(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA dbuf, void *state)
+{
+	return 0;
+}
+
+/*
+  Copy old_name to new_name via "<new_name>.tmp": the copy is reopened
+  read-only and its record count compared before it is renamed into place.
+  Returns 0 on success, 1 on failure.  This function is also used for restore.
+*/
+static int backup_ntdb(const char *old_name, const char *new_name)
+{
+	struct ntdb_context *ntdb;
+	struct ntdb_context *ntdb_new;
+	char *tmp_name;
+	struct stat st;
+	int count1, count2;
+	enum NTDB_ERROR err;
+	union ntdb_attribute log_attr;
+
+	tmp_name = add_suffix(new_name, ".tmp");
+
+	/* stat the old ntdb to find its permissions */
+	if (stat(old_name, &st) != 0) {
+		perror(old_name);
+		free(tmp_name);
+		return 1;
+	}
+
+	log_attr.base.attr = NTDB_ATTRIBUTE_LOG;
+	log_attr.base.next = NULL;
+	log_attr.log.fn = ntdb_log;
+
+	/* open the old ntdb */
+	ntdb = ntdb_open(old_name, NTDB_DEFAULT, O_RDWR, 0, &log_attr);
+	if (!ntdb) {
+		printf("Failed to open %s\n", old_name);
+		free(tmp_name);
+		return 1;
+	}
+
+	/* remove any stale temporary so the O_EXCL create below can succeed */
+	unlink(tmp_name);
+	ntdb_new = ntdb_open(tmp_name, NTDB_DEFAULT,
+			   O_RDWR|O_CREAT|O_EXCL, st.st_mode & 0777,
+			   &log_attr);
+	if (!ntdb_new) {
+		perror(tmp_name);
+		ntdb_close(ntdb);
+		free(tmp_name);
+		return 1;
+	}
+
+	err = ntdb_transaction_start(ntdb);
+	if (err) {
+		fprintf(stderr, "Failed to start transaction on old ntdb: %s\n",
+			ntdb_errorstr(err));
+		ntdb_close(ntdb);
+		ntdb_close(ntdb_new);
+		unlink(tmp_name);
+		free(tmp_name);
+		return 1;
+	}
+
+	/* lock the backup ntdb so that nobody else can change it */
+	err = ntdb_lockall(ntdb_new);
+	if (err) {
+		fprintf(stderr, "Failed to lock backup ntdb: %s\n",
+			ntdb_errorstr(err));
+		ntdb_close(ntdb);
+		ntdb_close(ntdb_new);
+		unlink(tmp_name);
+		free(tmp_name);
+		return 1;
+	}
+
+	failed = 0;
+
+	/* traverse and copy */
+	count1 = ntdb_traverse(ntdb, copy_fn, (void *)ntdb_new);
+	if (count1 < 0 || failed) {
+		fprintf(stderr,"failed to copy %s\n", old_name);
+		ntdb_close(ntdb);
+		ntdb_close(ntdb_new);
+		unlink(tmp_name);
+		free(tmp_name);
+		return 1;
+	}
+
+	/* close the old ntdb */
+	ntdb_close(ntdb);
+
+	/* copy done, unlock the backup ntdb */
+	ntdb_unlockall(ntdb_new);
+
+#ifdef HAVE_FDATASYNC
+	if (fdatasync(ntdb_fd(ntdb_new)) != 0) {
+#else
+	if (fsync(ntdb_fd(ntdb_new)) != 0) {
+#endif
+		/* not fatal */
+		fprintf(stderr, "failed to fsync backup file\n");
+	}
+
+	/* close the new ntdb and re-open read-only */
+	ntdb_close(ntdb_new);
+
+	ntdb_new = ntdb_open(tmp_name, NTDB_DEFAULT, O_RDONLY, 0, &log_attr);
+	if (!ntdb_new) {
+		/* perror() first: the calls after it could overwrite errno */
+		perror(tmp_name);
+		fprintf(stderr,"failed to reopen %s\n", tmp_name);
+		unlink(tmp_name);
+		free(tmp_name);
+		return 1;
+	}
+
+	/* traverse the new ntdb to confirm */
+	count2 = ntdb_traverse(ntdb_new, test_fn, NULL);
+	if (count2 != count1) {
+		fprintf(stderr,"failed to copy %s\n", old_name);
+		ntdb_close(ntdb_new);
+		unlink(tmp_name);
+		free(tmp_name);
+		return 1;
+	}
+
+	/* close the new ntdb and rename it to .bak */
+	ntdb_close(ntdb_new);
+	if (rename(tmp_name, new_name) != 0) {
+		perror(new_name);
+		free(tmp_name);
+		return 1;
+	}
+
+	free(tmp_name);
+
+	return 0;
+}
+
+/*
+  verify fname by traversing it; if that fails, restore it from bak_name
+*/
+static int verify_ntdb(const char *fname, const char *bak_name)
+{
+	union ntdb_attribute log_attr;
+	struct ntdb_context *ntdb;
+	int count;
+
+	log_attr.base.attr = NTDB_ATTRIBUTE_LOG;
+	log_attr.base.next = NULL;
+	log_attr.log.fn = ntdb_log;
+
+	/* a failed open is treated exactly like a failed traverse */
+	ntdb = ntdb_open(fname, NTDB_DEFAULT, O_RDONLY, 0, &log_attr);
+	if (!ntdb) {
+		count = -1;
+	} else {
+		count = ntdb_traverse(ntdb, test_fn, NULL);
+		ntdb_close(ntdb);
+	}
+
+	if (count >= 0) {
+		printf("%s : %d records\n", fname, count);
+		return 0;
+	}
+
+	/* a negative count means an error: copy the backup over fname,
+	 * using the same routine that made the backup */
+	printf("restoring %s\n", fname);
+	return backup_ntdb(bak_name, fname);
+}
+
+/*
+  return 1 if fname1 exists and is newer than fname2 (or fname2 is missing)
+*/
+static int file_newer(const char *fname1, const char *fname2)
+{
+	struct stat candidate, reference;
+
+	if (stat(fname1, &candidate) != 0)
+		return 0;
+	/* nothing to compare against, so fname1 counts as newer */
+	if (stat(fname2, &reference) != 0)
+		return 1;
+	return candidate.st_mtime > reference.st_mtime ? 1 : 0;
+}
+
+/* Print the command-line help, one line per option accepted by main(). */
+static void usage(void)
+{
+	printf("Usage: ntdbbackup [options] <fname...>\n\n");
+	printf("   -h            this help message\n");
+	printf("   -v            verify mode (restore if corrupt)\n");
+	printf("   -s suffix     set the backup suffix\n");
+}
+
+
+/* For each file: with -v verify (and restore) it, else back it up if newer than its backup. */
+int main(int argc, char *argv[])
+{
+	int i;
+	int ret = 0;
+	int c;
+	int verify = 0;
+	const char *suffix = ".bak";
+
+	while ((c = getopt(argc, argv, "vhs:")) != -1) {
+		switch (c) {
+		case 'h':
+			usage();
+			exit(0);
+		case 'v':
+			verify = 1;
+			break;
+		case 's':
+			suffix = optarg;
+			break;
+		default:	/* '?': unknown option or missing -s argument */
+			usage();
+			exit(1);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+	if (argc < 1) {
+		usage();
+		exit(1);
+	}
+
+	for (i=0; i<argc; i++) {
+		const char *fname = argv[i];
+		char *bak_name = add_suffix(fname, suffix);
+
+		if (verify) {
+			if (verify_ntdb(fname, bak_name) != 0) {
+				ret = 1;
+			}
+		} else {
+			if (file_newer(fname, bak_name) &&
+			    backup_ntdb(fname, bak_name) != 0) {
+				ret = 1;
+			}
+		}
+		free(bak_name);
+	}
+
+	return ret;
+}
diff --git a/lib/ntdb/tools/ntdbdump.c b/lib/ntdb/tools/ntdbdump.c
new file mode 100644
index 0000000000..1b1c59eae3
--- /dev/null
+++ b/lib/ntdb/tools/ntdbdump.c
@@ -0,0 +1,122 @@
+/*
+   simple ntdb dump util
+   Copyright (C) Andrew Tridgell              2001
+   Copyright (C) Rusty Russell                2011
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#include "config.h"
+#include "ntdb.h"
+#ifdef HAVE_LIBREPLACE
+#include <replace.h>
+#include <system/filesys.h>
+#include <system/locale.h>
+#else
+#include <ctype.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#endif
+
+static void print_data(NTDB_DATA d)
+{
+	const unsigned char *cur = (unsigned char *)d.dptr;
+	int remaining;
+
+	/* Printable bytes go out verbatim; the rest (and " or \) as \XX. */
+	for (remaining = d.dsize; remaining != 0; remaining--, cur++) {
+		if (!isprint(*cur) || strchr("\"\\", *cur))
+			printf("\\%02X", *cur);
+		else
+			fputc(*cur, stdout);
+	}
+}
+/* ntdb_traverse() callback: print one record as a "{ key... data... }" stanza. */
+static int traverse_fn(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA dbuf, void *state)
+{
+	fputs("{\n", stdout);
+	printf("key(%d) = \"", (int)key.dsize);
+	print_data(key);
+	fputs("\"\n", stdout);
+	printf("data(%d) = \"", (int)dbuf.dsize);
+	print_data(dbuf);
+	fputs("\"\n", stdout);
+	fputs("}\n", stdout);
+	return 0;
+}
+/* Dump all of fname, or just the value stored under keyname; 0 on success. */
+static int dump_ntdb(const char *fname, const char *keyname)
+{
+	struct ntdb_context *ntdb;
+	NTDB_DATA key, value;
+	int ret = 0;	/* becomes 1 if the requested key cannot be fetched */
+
+	ntdb = ntdb_open(fname, 0, O_RDONLY, 0, NULL);
+	if (!ntdb) {
+		printf("Failed to open %s\n", fname);
+		return 1;
+	}
+
+	if (!keyname) {
+		ntdb_traverse(ntdb, traverse_fn, NULL);
+	} else {
+		key = ntdb_mkdata(keyname, strlen(keyname));
+		ret = (ntdb_fetch(ntdb, key, &value) != 0);
+		if (!ret) {
+			print_data(value);
+			free(value.dptr);
+		}
+	}
+	ntdb_close(ntdb);	/* release the handle on every path */
+	return ret;
+}
+/* Print the command-line synopsis to stdout. */
+static void usage( void)
+{
+	fputs("Usage: ntdbdump [options] <filename>\n\n", stdout);
+	fputs("   -h          this help message\n", stdout);
+	fputs("   -k keyname  dumps value of keyname\n", stdout);
+}
+/* Parse -h / -k keyname, then dump the database named by the first operand. */
+ int main(int argc, char *argv[])
+{
+	char *fname, *keyname=NULL;
+	int c;
+
+	while ((c = getopt( argc, argv, "hk:")) != -1) {
+		switch (c) {
+		case 'h':
+			usage();
+			exit( 0);
+		case 'k':
+			keyname = optarg;
+			break;
+		default:
+			usage();
+			exit( 1);
+		}
+	}
+
+	/* Options alone are not enough: argv[optind] must be a filename. */
+	if (optind >= argc) {
+		printf("Usage: ntdbdump <fname>\n");
+		exit(1);
+	}
+	fname = argv[optind];
+
+	return dump_ntdb(fname, keyname);
+}
diff --git a/lib/ntdb/tools/ntdbrestore.c b/lib/ntdb/tools/ntdbrestore.c
new file mode 100644
index 0000000000..dad591d562
--- /dev/null
+++ b/lib/ntdb/tools/ntdbrestore.c
@@ -0,0 +1,231 @@
+/*
+   ntdbrestore -- construct a ntdb from tdbdump output.
+   Copyright (C) Volker Lendecke		2010
+   Copyright (C) Simon McVittie			2005
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "config.h"
+#include "ntdb.h"
+#include <assert.h>
+#ifdef HAVE_LIBREPLACE
+#include <replace.h>
+#include <system/filesys.h>
+#else
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#endif
+/* Skip to the next '(' and parse the "N) = \"" header after it; -1 on error. */
+static int read_linehead(FILE *f)
+{
+	int i, c;
+	int num_bytes;
+	char prefix[128];
+
+	/* discard input up to and including the opening '(' */
+	do {
+		c = getc(f);
+		if (c == EOF) {
+			return -1;
+		}
+	} while (c != '(');
+	/* collect the text between '(' and the opening '"' of the value */
+	for (i=0; i<sizeof(prefix); i++) {
+		c = getc(f);
+		if (c == EOF) {
+			return -1;
+		}
+		prefix[i] = c;
+		if (c == '"') {
+			break;
+		}
+	}
+	if (i == sizeof(prefix)) {
+		return -1;	/* no '"' within the buffer: malformed */
+	}
+	prefix[i] = '\0';	/* replace the '"' with the terminator */
+
+	/* a negative count would turn into a huge size_t in read_data() */
+	if (sscanf(prefix, "%d) = ", &num_bytes) != 1 || num_bytes < 0) {
+		return -1;
+	}
+	return num_bytes;
+}
+/* Read one hex digit from f and return its value (0-15), or -1 on error. */
+static int read_hex(FILE *f) {
+	int c;
+	c = getc(f);	/* same stream as read_data(), not unconditionally stdin */
+	if (c == EOF) {
+		fprintf(stderr, "Unexpected EOF in data\n");
+		return -1;
+	} else if (c == '"') {
+		fprintf(stderr, "Unexpected \\\" sequence\n");
+		return -1;
+	} else if ('0' <= c && c <= '9')  {
+		return c - '0';
+	} else if ('A' <= c && c <= 'F')  {
+		return c - 'A' + 10;
+	} else if ('a' <= c && c <= 'f')  {
+		return c - 'a' + 10;
+	} else {
+		fprintf(stderr, "Invalid hex: %c\n", c);
+		return -1;
+	}
+}
+/* Read size bytes of escaped value text from f into a new buffer; -1 on error. */
+static int read_data(FILE *f, NTDB_DATA *d, size_t size) {
+	int c, low, high;
+	size_t i;
+	/* malloc(0) may legally return NULL, so never ask for zero bytes */
+	d->dptr = (unsigned char *)malloc(size ? size : 1);
+	if (d->dptr == NULL) {
+		return -1;
+	}
+	d->dsize = size;
+
+	for (i=0; i<size; i++) {
+		c = getc(f);
+		if (c == EOF) {
+			fprintf(stderr, "Unexpected EOF in data\n");
+			return -1;	/* callers only test for -1 */
+		} else if (c == '"') {
+			return 0;	/* closing quote came before size bytes */
+		} else if (c == '\\') {
+			high = read_hex(f);
+			if (high < 0) {
+				return -1;
+			}
+			high = high << 4;
+			assert(high == (high & 0xf0));
+			low = read_hex(f);
+			if (low < 0) {
+				return -1;
+			}
+			assert(low == (low & 0x0f));
+			d->dptr[i] = (low|high);	/* \XY -> one byte */
+		} else {
+			d->dptr[i] = c;
+		}
+	}
+	return 0;
+}
+
+static int swallow(FILE *f, const char *s, int *eof)
+{
+	char buf[128];
+
+	/* Consume one line; it must match s exactly, newline included. */
+	if (!fgets(buf, sizeof(buf), f)) {
+		/* Nothing could be read: tell the caller if it asked. */
+		if (eof)
+			*eof = 1;
+		return -1;
+	}
+
+	/* 0 on an exact match, -1 otherwise. */
+	return strcmp(buf, s) == 0 ? 0 : -1;
+}
+/* Read one record stanza from f and insert it into ntdb; false on any failure. */
+static bool read_rec(FILE *f, struct ntdb_context *ntdb, int *eof)
+{
+	int length;
+	NTDB_DATA key, data;
+	bool ret = false;
+	enum NTDB_ERROR e;
+
+	key.dptr = NULL;	/* so the free() calls at "fail" are always safe */
+	data.dptr = NULL;
+
+	if (swallow(f, "{\n", eof) == -1) {	/* only this call reports *eof */
+		goto fail;
+	}
+	length = read_linehead(f);	/* the N of "key(N) = " */
+	if (length == -1) {
+		goto fail;
+	}
+	if (read_data(f, &key, length) == -1) {
+		goto fail;
+	}
+	if (swallow(f, "\"\n", NULL) == -1) {	/* closing quote of the key */
+		goto fail;
+	}
+	length = read_linehead(f);	/* the N of "data(N) = " */
+	if (length == -1) {
+		goto fail;
+	}
+	if (read_data(f, &data, length) == -1) {
+		goto fail;
+	}
+	if ((swallow(f, "\"\n", NULL) == -1)
+	    || (swallow(f, "}\n", NULL) == -1)) {
+		goto fail;
+	}
+	e = ntdb_store(ntdb, key, data, NTDB_INSERT);
+	if (e != NTDB_SUCCESS) {
+		fprintf(stderr, "NTDB error: %s\n", ntdb_errorstr(e));
+		goto fail;
+	}
+
+	ret = true;
+fail:	/* reached on success too: the buffers are released either way */
+	free(key.dptr);
+	free(data.dptr);
+	return ret;
+}
+/* Create fname (it must not exist yet) and fill it from the dump on stdin. */
+static int restore_ntdb(const char *fname)
+{
+	struct ntdb_context *ntdb;
+
+	ntdb = ntdb_open(fname, 0, O_RDWR|O_CREAT|O_EXCL, 0666, NULL);
+	if (!ntdb) {
+		perror("ntdb_open");
+		fprintf(stderr, "Failed to open %s\n", fname);
+		return 1;
+	}
+
+	while (1) {
+		int eof = 0;
+		if (!read_rec(stdin, ntdb, &eof)) {
+			if (eof)
+				break;	/* clean end of input between records */
+			ntdb_close(ntdb);	/* don't leak the handle on a bad record */
+			return 1;
+		}
+	}
+	if (ntdb_close(ntdb)) {
+		fprintf(stderr, "Error closing ntdb\n");
+		return 1;
+	}
+	fprintf(stderr, "EOF\n");
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	const char *dbname;
+
+	/* The database name is mandatory; the dump itself is read from stdin. */
+	if (argc >= 2) {
+		dbname = argv[1];
+		return restore_ntdb(dbname);
+	}
+
+	printf("Usage: %s dbname < tdbdump_output\n", argv[0]);
+	exit(1);
+}
diff --git a/lib/ntdb/tools/ntdbtool.c b/lib/ntdb/tools/ntdbtool.c
new file mode 100644
index 0000000000..7c1ef7df7a
--- /dev/null
+++ b/lib/ntdb/tools/ntdbtool.c
@@ -0,0 +1,810 @@
+/*
+   Unix SMB/CIFS implementation.
+   Samba database functions
+   Copyright (C) Andrew Tridgell              1999-2000
+   Copyright (C) Paul `Rusty' Russell		   2000
+   Copyright (C) Jeremy Allison			   2000
+   Copyright (C) Andrew Esh                        2001
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "config.h"
+#include "ntdb.h"
+#ifdef HAVE_LIBREPLACE
+#include <replace.h>
+#include <system/filesys.h>
+#include <system/time.h>
+#include <system/locale.h>
+#else
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <string.h>
+#include <stdarg.h>
+#endif
+
+static int do_command(void);
+const char *cmdname;	/* command being executed; "" means repeat 'next' */
+char *arg1, *arg2;	/* its first and second arguments, or NULL */
+size_t arg1len, arg2len;	/* argument lengths as set by convert_string() */
+int bIterate = 0;	/* set by 'first'; 'next' does nothing while it is 0 */
+char *line;
+NTDB_DATA iterate_kbuf;	/* key handed to first_record()/next_record() */
+char cmdline[1024];
+static int disable_mmap;	/* flipped by 'mmap'; create/open then pass NTDB_NOMMAP */
+
+enum commands {	/* one value per command do_command() can dispatch */
+	CMD_CREATE_NTDB,
+	CMD_OPEN_NTDB,
+	CMD_TRANSACTION_START,
+	CMD_TRANSACTION_COMMIT,
+	CMD_TRANSACTION_CANCEL,
+	CMD_ERASE,
+	CMD_DUMP,
+	CMD_INSERT,
+	CMD_MOVE,
+	CMD_STORE,
+	CMD_SHOW,
+	CMD_KEYS,
+	CMD_HEXKEYS,
+	CMD_DELETE,
+#if 0	/* the handlers for these two are compiled out in do_command() as well */
+	CMD_LIST_HASH_FREE,
+	CMD_LIST_FREE,
+#endif
+	CMD_INFO,
+	CMD_MMAP,
+	CMD_SPEED,
+	CMD_FIRST,
+	CMD_NEXT,
+	CMD_SYSTEM,
+	CMD_CHECK,
+	CMD_QUIT,
+	CMD_HELP	/* also what do_command() falls back to for unknown input */
+};
+
+typedef struct {	/* one row of cmd_table */
+	const char *name;	/* text the user types; NULL marks the end */
+	enum commands cmd;	/* command selected by that text */
+} COMMAND_TABLE;
+
+COMMAND_TABLE cmd_table[] = {	/* scanned top to bottom by do_command() */
+	{"create",	CMD_CREATE_NTDB},
+	{"open",	CMD_OPEN_NTDB},
+#if 0	/* transaction commands are not offered to the user */
+	{"transaction_start",	CMD_TRANSACTION_START},
+	{"transaction_commit",	CMD_TRANSACTION_COMMIT},
+	{"transaction_cancel",	CMD_TRANSACTION_CANCEL},
+#endif
+	{"erase",	CMD_ERASE},
+	{"dump",	CMD_DUMP},
+	{"insert",	CMD_INSERT},
+	{"move",	CMD_MOVE},
+	{"store",	CMD_STORE},
+	{"show",	CMD_SHOW},
+	{"keys",	CMD_KEYS},
+	{"hexkeys",	CMD_HEXKEYS},
+	{"delete",	CMD_DELETE},
+#if 0
+	{"list",	CMD_LIST_HASH_FREE},
+	{"free",	CMD_LIST_FREE},
+#endif
+	{"info",	CMD_INFO},
+	{"speed",	CMD_SPEED},
+	{"mmap",	CMD_MMAP},
+	{"first",	CMD_FIRST},
+	{"1",		CMD_FIRST},	/* short alias */
+	{"next",	CMD_NEXT},
+	{"n",		CMD_NEXT},	/* short alias */
+	{"check",	CMD_CHECK},
+	{"quit",	CMD_QUIT},
+	{"q",		CMD_QUIT},	/* short alias */
+	{"!",		CMD_SYSTEM},
+	{NULL,		CMD_HELP}	/* end marker: a NULL name stops the scan */
+};
+
+struct timeval tp1,tp2;	/* start/end stamps shared by the two timer helpers */
+/* Record the current time in tp1 as the start of a measurement. */
+static void _start_timer(void)
+{
+	gettimeofday(&tp1,NULL);
+}
+/* Stamp tp2 and return the seconds elapsed since _start_timer() set tp1. */
+static double _end_timer(void)
+{
+	gettimeofday(&tp2,NULL);
+
+	return (double)(tp2.tv_sec - tp1.tv_sec)
+		+ 1.0e-6 * (tp2.tv_usec - tp1.tv_usec);
+}
+/* Log callback installed through NTDB_ATTRIBUTE_LOG: "ntdb:<name>:<error>:<message>". */
+static void ntdb_log(struct ntdb_context *ntdb,
+		    enum ntdb_log_level level,
+		    enum NTDB_ERROR ecode,
+		    const char *message,
+		    void *data)
+{
+	fprintf(stderr, "ntdb:%s:%s:%s\n",	/* level and data are not used */
+		ntdb_name(ntdb), ntdb_errorstr(ecode), message);
+}
+
+/* ntdbtool: an interactive tool for manipulating an ntdb database */
+
+static struct ntdb_context *ntdb;	/* the currently open database, or NULL */
+
+static int print_rec(struct ntdb_context *the_ntdb, NTDB_DATA key, NTDB_DATA dbuf, void *state);
+static int print_key(struct ntdb_context *the_ntdb, NTDB_DATA key, NTDB_DATA dbuf, void *state);
+static int print_hexkey(struct ntdb_context *the_ntdb, NTDB_DATA key, NTDB_DATA dbuf, void *state);
+/* Print len bytes of buf as text, substituting '.' for unprintable bytes. */
+static void print_asc(const char *buf,int len)
+{
+	int i;
+
+	/* We're probably printing ASCII strings so don't try to display
+	   the trailing NUL character.  Check len first so that an empty
+	   buffer never makes us read buf[-1]. */
+	if (len > 0 && buf[len - 1] == 0)
+	        len--;
+	/* isprint() needs an unsigned char value; plain char may be negative */
+	for (i=0;i<len;i++)
+		printf("%c",isprint((unsigned char)buf[i])?buf[i]:'.');
+}
+/* Hex dump of buf: 16 bytes per row, "[offset]" first, ASCII columns last. */
+static void print_data(const char *buf,int len)
+{
+	int i=0;
+	if (len<=0) return;
+	printf("[%03X] ",i);	/* offset label of the first row */
+	for (i=0;i<len;) {
+		printf("%02X ",(int)((unsigned char)buf[i]));
+		i++;
+		if (i%8 == 0) printf(" ");	/* extra gap after each group of 8 */
+		if (i%16 == 0) {	/* a full row: append its two ASCII halves */
+			print_asc(&buf[i-16],8); printf(" ");
+			print_asc(&buf[i-8],8); printf("\n");
+			if (i<len) printf("[%03X] ",i);
+		}
+	}
+	if (i%16) {	/* a partial last row is still pending */
+		int n;
+
+		n = 16 - (i%16);	/* hex cells missing from this row */
+		printf(" ");
+		if (n>8) printf(" ");
+		while (n--) printf("   ");	/* pad so the ASCII columns line up */
+
+		n = i%16;
+		if (n > 8) n = 8;	/* first ASCII half holds at most 8 bytes */
+		print_asc(&buf[i-(i%16)],n); printf(" ");
+		n = (i%16) - n;	/* whatever is left goes in the second half */
+		if (n>0) print_asc(&buf[i-n],n);
+		printf("\n");
+	}
+}
+/* Print the command summary; entries under #if 0 mirror those in cmd_table. */
+static void help(void)
+{
+	printf("\nntdbtool: \n"
+"  create    dbname     : create a database\n"
+"  open      dbname     : open an existing database\n"
+#if 0
+"  transaction_start    : start a transaction\n"
+"  transaction_commit   : commit a transaction\n"
+"  transaction_cancel   : cancel a transaction\n"
+#endif
+"  erase                : erase the database\n"
+"  dump                 : dump the database as strings\n"
+"  keys                 : dump the database keys as strings\n"
+"  hexkeys              : dump the database keys as hex values\n"
+"  info                 : print summary info about the database\n"
+"  insert    key  data  : insert a record\n"
+"  move      key  file  : move a record to a destination ntdb\n"
+"  store     key  data  : store a record (replace)\n"
+"  show      key        : show a record by key\n"
+"  delete    key        : delete a record by key\n"
+#if 0
+"  list                 : print the database hash table and freelist\n"
+"  free                 : print the database freelist\n"
+#endif
+"  check                : check the integrity of an opened database\n"
+"  speed                : perform speed tests on the database\n"
+"  mmap                 : toggle mmap use for databases opened afterwards\n"
+"  ! command            : execute system command\n"
+"  1 | first            : print the first record\n"
+"  n | next             : print the next record\n"
+"  q | quit             : terminate\n"
+"  \\n                   : repeat 'next' command\n\n");
+}
+/* Print why, prefixed with the ntdb error text unless err is NTDB_SUCCESS. */
+static void terror(enum NTDB_ERROR err, const char *why)
+{
+	if (err == NTDB_SUCCESS)
+		printf("%s\n", why);
+	else
+		printf("%s:%s\n", ntdb_errorstr(err), why);
+}
+/* Close any open database, then create (or truncate) tdbname and open it. */
+static void create_ntdb(const char *tdbname)
+{
+	union ntdb_attribute log_attr;
+	int flags = disable_mmap ? NTDB_NOMMAP : 0;
+
+	log_attr.base.attr = NTDB_ATTRIBUTE_LOG;
+	log_attr.base.next = NULL;
+	log_attr.log.fn = ntdb_log;
+
+	if (ntdb) ntdb_close(ntdb);
+	ntdb = ntdb_open(tdbname, flags, O_RDWR | O_CREAT | O_TRUNC, 0600, &log_attr);
+	if (!ntdb)
+		printf("Could not create %s: %s\n", tdbname, strerror(errno));
+}
+/* Close any open database, then open the existing file tdbname read-write. */
+static void open_ntdb(const char *tdbname)
+{
+	union ntdb_attribute log_attr;
+	int flags = disable_mmap ? NTDB_NOMMAP : 0;
+
+	log_attr.base.attr = NTDB_ATTRIBUTE_LOG;
+	log_attr.base.next = NULL;
+	log_attr.log.fn = ntdb_log;
+
+	if (ntdb) ntdb_close(ntdb);
+	ntdb = ntdb_open(tdbname, flags, O_RDWR, 0600, &log_attr);
+	if (!ntdb)
+		printf("Could not open %s: %s\n", tdbname, strerror(errno));
+}
+/* Store data under keyname with NTDB_INSERT, reporting any error via terror(). */
+static void insert_ntdb(char *keyname, size_t keylen, char* data, size_t datalen)
+{
+	NTDB_DATA k, v;
+	enum NTDB_ERROR rc;
+
+	if (!keyname || !keylen) {
+		terror(NTDB_SUCCESS, "need key");
+		return;
+	}
+
+	k.dsize = keylen;
+	k.dptr = (unsigned char *)keyname;
+	v.dsize = datalen;
+	v.dptr = (unsigned char *)data;
+
+	/* Unlike store_ntdb(), the data argument is not checked here. */
+	rc = ntdb_store(ntdb, k, v, NTDB_INSERT);
+	if (rc)
+		terror(rc, "insert failed");
+}
+/* Echo the record, then store it with NTDB_REPLACE; key and data are required. */
+static void store_ntdb(char *keyname, size_t keylen, char* data, size_t datalen)
+{
+	NTDB_DATA k, v;
+	enum NTDB_ERROR rc;
+
+	if (!keyname || !keylen) {
+		terror(NTDB_SUCCESS, "need key");
+		return;
+	}
+	if (!data || !datalen) {
+		terror(NTDB_SUCCESS, "need data");
+		return;
+	}
+
+	k.dsize = keylen;
+	k.dptr = (unsigned char *)keyname;
+	v.dsize = datalen;
+	v.dptr = (unsigned char *)data;
+
+	/* Show what is about to be written. */
+	printf("Storing key:\n");
+	print_rec(ntdb, k, v, NULL);
+
+	/* Unlike insert_ntdb(), this passes NTDB_REPLACE. */
+	rc = ntdb_store(ntdb, k, v, NTDB_REPLACE);
+	if (rc)
+		terror(rc, "store failed");
+}
+/* Fetch the record stored under keyname and print it with print_rec(). */
+static void show_ntdb(char *keyname, size_t keylen)
+{
+	NTDB_DATA k, v;
+	enum NTDB_ERROR rc;
+
+	if (!keyname || !keylen) {
+		terror(NTDB_SUCCESS, "need key");
+		return;
+	}
+
+	k.dsize = keylen;
+	k.dptr = (unsigned char *)keyname;
+
+	rc = ntdb_fetch(ntdb, k, &v);
+	if (!rc) {
+		/* print the fetched value, then release its buffer */
+		print_rec(ntdb, k, v, NULL);
+		free( v.dptr );
+		return;
+	}
+
+	terror(rc, "fetch failed");
+}
+/* Delete the record stored under keyname, reporting any error via terror(). */
+static void delete_ntdb(char *keyname, size_t keylen)
+{
+	NTDB_DATA k;
+	enum NTDB_ERROR rc;
+
+	if (!keyname || !keylen) {
+		terror(NTDB_SUCCESS, "need key");
+		return;
+	}
+
+	k.dsize = keylen;
+	k.dptr = (unsigned char *)keyname;
+
+	/* A non-zero result is an ntdb error code. */
+	rc = ntdb_delete(ntdb, k);
+	if (rc)
+		terror(rc, "delete failed");
+}
+/* Copy the record at keyname into the ntdb file tdbname; the source keeps its copy. */
+static void move_rec(char *keyname, size_t keylen, char* tdbname)
+{
+	NTDB_DATA key, dbuf;
+	struct ntdb_context *dst_ntdb;
+	enum NTDB_ERROR ecode;
+
+	if ((keyname == NULL) || (keylen == 0)) {
+		terror(NTDB_SUCCESS, "need key");
+		return;
+	}
+
+	if ( !tdbname ) {
+		terror(NTDB_SUCCESS, "need destination ntdb name");
+		return;
+	}
+
+	key.dptr = (unsigned char *)keyname;
+	key.dsize = keylen;
+
+	ecode = ntdb_fetch(ntdb, key, &dbuf);
+	if (ecode) {
+		terror(ecode, "fetch failed");
+		return;
+	}
+	print_rec(ntdb, key, dbuf, NULL);
+
+	dst_ntdb = ntdb_open(tdbname, 0, O_RDWR, 0600, NULL);
+	if ( !dst_ntdb ) {
+		terror(NTDB_SUCCESS, "unable to open destination ntdb");
+		free(dbuf.dptr);	/* the fetched copy must not leak */
+		return;
+	}
+
+	ecode = ntdb_store( dst_ntdb, key, dbuf, NTDB_REPLACE);
+	if (ecode)
+		terror(ecode, "failed to move record");
+	else
+		printf("record moved\n");
+	free(dbuf.dptr);	/* done with the fetched copy, as in show_ntdb() */
+	ntdb_close( dst_ntdb );
+}
+/* Print a record: key as text, data as a hex dump.  Always returns 0. */
+static int print_rec(struct ntdb_context *the_ntdb, NTDB_DATA key, NTDB_DATA dbuf, void *state)
+{
+	printf("\nkey %d bytes\n", (int)key.dsize);
+	print_asc((const char *)key.dptr, key.dsize);
+	printf("\ndata %d bytes\n", (int)dbuf.dsize);
+	print_data((const char *)dbuf.dptr, dbuf.dsize);
+	return 0;
+}
+/* Print only the key of a record, as text on one line.  Always returns 0. */
+static int print_key(struct ntdb_context *the_ntdb, NTDB_DATA key, NTDB_DATA dbuf, void *state)
+{
+	printf("key %d bytes: ", (int)key.dsize);
+	print_asc((const char *)key.dptr, key.dsize);
+	printf("\n");
+	return 0;
+}
+/* Print only the key of a record, as a hex dump.  Always returns 0. */
+static int print_hexkey(struct ntdb_context *the_ntdb, NTDB_DATA key, NTDB_DATA dbuf, void *state)
+{
+	printf("key %d bytes\n", (int)key.dsize);
+	print_data((const char *)key.dptr, key.dsize);
+	printf("\n");
+	return 0;
+}
+
+static int total_bytes;	/* running sum of data sizes seen by traverse_fn() */
+/* Traverse callback used by the speed test: just adds up the data sizes. */
+static int traverse_fn(struct ntdb_context *the_ntdb, NTDB_DATA key, NTDB_DATA dbuf, void *state)
+{
+	total_bytes += dbuf.dsize;
+	return 0;
+}
+/* Print the ntdb_summary() text for the open database. */
+static void info_ntdb(void)
+{
+	char *summary;
+	enum NTDB_ERROR ecode;
+
+	/* Ask for the summary with the histograms flag set. */
+	ecode = ntdb_summary(ntdb, NTDB_SUMMARY_HISTOGRAMS, &summary);
+	if (ecode) {
+		terror(ecode, "Getting summary");
+		return;
+	}
+	printf("%s", summary);
+	free(summary);
+}
+/* Time store, fetch, transaction and traverse loops for tlimit (default 5) seconds each. */
+static void speed_ntdb(const char *tlimit)
+{
+	unsigned timelimit = tlimit?atoi(tlimit):0;
+	double t;
+	int ops;
+	if (timelimit == 0) timelimit = 5;
+
+	ops = 0;
+	printf("Testing store speed for %u seconds\n", timelimit);
+	_start_timer();
+	do {
+		long int r = random();
+		NTDB_DATA key, dbuf;
+		key = ntdb_mkdata("store test", strlen("store test"));
+		dbuf.dptr = (unsigned char *)&r;
+		dbuf.dsize = sizeof(r);
+		ntdb_store(ntdb, key, dbuf, NTDB_REPLACE);
+		t = _end_timer();
+		ops++;
+	} while (t < timelimit);
+	printf("%10.3f ops/sec\n", ops/t);
+
+	ops = 0;
+	printf("Testing fetch speed for %u seconds\n", timelimit);
+	_start_timer();
+	do {
+		NTDB_DATA key, dbuf;
+		key = ntdb_mkdata("store test", strlen("store test"));
+		/* Free what a successful fetch returns, as show_ntdb() does;
+		   otherwise this loop leaks one buffer per iteration. */
+		if (ntdb_fetch(ntdb, key, &dbuf) == NTDB_SUCCESS)
+			free(dbuf.dptr);
+		t = _end_timer();
+		ops++;
+	} while (t < timelimit);
+	printf("%10.3f ops/sec\n", ops/t);
+
+	ops = 0;
+	printf("Testing transaction speed for %u seconds\n", timelimit);
+	_start_timer();
+	do {
+		long int r = random();
+		NTDB_DATA key, dbuf;
+		key = ntdb_mkdata("transaction test", strlen("transaction test"));
+		dbuf.dptr = (unsigned char *)&r;
+		dbuf.dsize = sizeof(r);
+		ntdb_transaction_start(ntdb);
+		ntdb_store(ntdb, key, dbuf, NTDB_REPLACE);
+		ntdb_transaction_commit(ntdb);
+		t = _end_timer();
+		ops++;
+	} while (t < timelimit);
+	printf("%10.3f ops/sec\n", ops/t);
+
+	ops = 0;
+	printf("Testing traverse speed for %u seconds\n", timelimit);
+	_start_timer();
+	do {
+		ntdb_traverse(ntdb, traverse_fn, NULL);
+		t = _end_timer();
+		ops++;
+	} while (t < timelimit);
+	printf("%10.3f ops/sec\n", ops/t);
+}
+/* Flip disable_mmap and report the new state. */
+static void toggle_mmap(void)
+{
+	/* The flag is read by create_ntdb() and open_ntdb(). */
+	disable_mmap = !disable_mmap;
+	if (!disable_mmap)
+		printf("mmap is enabled\n");
+	else
+		printf("mmap is disabled\n");
+}
+/* Prompt, read one line from stdin into a static buffer, strip the newline. */
+static char *ntdb_getline(const char *prompt)
+{
+	static char thisline[1024];
+	char *nl = NULL;
+	fputs(prompt, stdout);
+	thisline[0] = 0;
+	if (fgets(thisline, sizeof(thisline)-1, stdin)) nl = strchr(thisline, '\n');
+	if (!nl) return NULL;	/* read failed, or no newline was read */
+	*nl = 0;
+	return thisline;
+}
+/* Traverse callback for 'erase': delete the visited key, return ntdb_delete()'s result. */
+static int do_delete_fn(struct ntdb_context *the_ntdb, NTDB_DATA key, NTDB_DATA dbuf,
+                     void *state)
+{
+    return ntdb_delete(the_ntdb, key);
+}
+/* Put the first key of the_ntdb in *pkey and print that record. */
+static void first_record(struct ntdb_context *the_ntdb, NTDB_DATA *pkey)
+{
+	NTDB_DATA dbuf;
+	enum NTDB_ERROR ecode = ntdb_firstkey(the_ntdb, pkey);
+	if (!ecode)
+		ecode = ntdb_fetch(the_ntdb, *pkey, &dbuf);
+	if (ecode) terror(ecode, "fetch failed");
+	else {
+		print_rec(the_ntdb, *pkey, dbuf, NULL);
+		free(dbuf.dptr);	/* fetched data was leaked; *pkey stays for 'next' */
+	}
+}
+/* Advance *pkey to the following key of the_ntdb and print that record. */
+static void next_record(struct ntdb_context *the_ntdb, NTDB_DATA *pkey)
+{
+	NTDB_DATA dbuf;
+	enum NTDB_ERROR ecode = ntdb_nextkey(the_ntdb, pkey);
+	if (!ecode)
+		ecode = ntdb_fetch(the_ntdb, *pkey, &dbuf);
+	if (ecode)
+		terror(ecode, "fetch failed");
+	else {
+		print_rec(the_ntdb, *pkey, dbuf, NULL);
+		free(dbuf.dptr);	/* fetched data was leaked; *pkey stays for 'next' */
+	}
+}
+/* Run ntdb_check() on the_ntdb and print the verdict. */
+static void check_db(struct ntdb_context *the_ntdb)
+{
+	if (!the_ntdb) {
+		printf("Error: No database opened!\n");
+		return;
+	}
+	if (ntdb_check(the_ntdb, NULL, NULL) == 0)
+		printf("Database integrity is OK.\n");
+	else
+		printf("Integrity check for the opened database failed.\n");
+}
+/* Dispatch the command in cmdname (args in arg1/arg2); returns 1 only for quit. */
+static int do_command(void)
+{
+	COMMAND_TABLE *ctp = cmd_table;
+	enum commands mycmd = CMD_HELP;
+
+	if (cmdname && strlen(cmdname) == 0) {
+		mycmd = CMD_NEXT;
+	} else {
+		while (ctp->name) {	/* first entry that prefixes the input wins */
+			if (strncmp(ctp->name,cmdname,strlen(ctp->name)) == 0) {
+				mycmd = ctp->cmd;
+				break;
+			}
+			ctp++;
+		}
+	}
+
+	switch (mycmd) {
+	case CMD_CREATE_NTDB:
+		bIterate = 0;
+		create_ntdb(arg1);
+		return 0;
+	case CMD_OPEN_NTDB:
+		bIterate = 0;
+		open_ntdb(arg1);
+		return 0;
+	case CMD_MMAP:
+		/* only affects later create/open, so needs no open database */
+		toggle_mmap();
+		return 0;
+	case CMD_SYSTEM:
+		/* Shell command */
+		if (system(arg1) == -1) {
+			terror(NTDB_SUCCESS, "system() call failed\n");
+		}
+		return 0;
+	case CMD_QUIT:
+		return 1;
+	default:
+		/* all the rest require a open database */
+		if (!ntdb) {
+			bIterate = 0;
+			terror(NTDB_SUCCESS, "database not open");
+			help();
+			return 0;
+		}
+		switch (mycmd) {
+		case CMD_TRANSACTION_START:
+			bIterate = 0;
+			ntdb_transaction_start(ntdb);
+			return 0;
+		case CMD_TRANSACTION_COMMIT:
+			bIterate = 0;
+			ntdb_transaction_commit(ntdb);
+			return 0;
+		case CMD_TRANSACTION_CANCEL:
+			bIterate = 0;
+			ntdb_transaction_cancel(ntdb);
+			return 0;
+		case CMD_ERASE:
+			bIterate = 0;
+			ntdb_traverse(ntdb, do_delete_fn, NULL);
+			return 0;
+		case CMD_DUMP:
+			bIterate = 0;
+			ntdb_traverse(ntdb, print_rec, NULL);
+			return 0;
+		case CMD_INSERT:
+			bIterate = 0;
+			insert_ntdb(arg1, arg1len,arg2,arg2len);
+			return 0;
+		case CMD_MOVE:
+			bIterate = 0;
+			move_rec(arg1,arg1len,arg2);
+			return 0;
+		case CMD_STORE:
+			bIterate = 0;
+			store_ntdb(arg1,arg1len,arg2,arg2len);
+			return 0;
+		case CMD_SHOW:
+			bIterate = 0;
+			show_ntdb(arg1, arg1len);
+			return 0;
+		case CMD_KEYS:
+			ntdb_traverse(ntdb, print_key, NULL);
+			return 0;
+		case CMD_HEXKEYS:
+			ntdb_traverse(ntdb, print_hexkey, NULL);
+			return 0;
+		case CMD_DELETE:
+			bIterate = 0;
+			delete_ntdb(arg1,arg1len);
+			return 0;
+#if 0
+		case CMD_LIST_HASH_FREE:
+			ntdb_dump_all(ntdb);
+			return 0;
+		case CMD_LIST_FREE:
+			ntdb_printfreelist(ntdb);
+			return 0;
+#endif
+		case CMD_INFO:
+			info_ntdb();
+			return 0;
+		case CMD_SPEED:
+			speed_ntdb(arg1);
+			return 0;
+		case CMD_FIRST:
+			bIterate = 1;
+			first_record(ntdb, &iterate_kbuf);
+			return 0;
+		case CMD_NEXT:
+			if (bIterate)
+				next_record(ntdb, &iterate_kbuf);
+			return 0;
+		case CMD_CHECK:
+			check_db(ntdb);
+			return 0;
+		case CMD_HELP:
+			help();
+			return 0;
+		case CMD_CREATE_NTDB:
+		case CMD_OPEN_NTDB:
+		case CMD_MMAP:
+		case CMD_SYSTEM:
+		case CMD_QUIT:
+			/*
+			 * unhandled commands.  cases included here to avoid compiler
+			 * warnings.
+			 */
+			return 0;
+		}
+	}
+
+	return 0;
+}
+/* Decode \XX hex escapes in place; *sizep gets the byte count, result is NUL-terminated. */
+static char *convert_string(char *instring, size_t *sizep)
+{
+	size_t length = 0;
+	char *outp, *inp;
+	char temp[3];
+
+	outp = inp = instring;
+	while (*inp) {
+		if (*inp == '\\' && inp[1] != '\0') {
+			inp++;	/* skip the backslash */
+			if (*inp && strchr("0123456789abcdefABCDEF",(int)*inp)) {
+				temp[0] = *inp++;	/* one or two hex digits -> one byte */
+				temp[1] = '\0';
+				if (*inp && strchr("0123456789abcdefABCDEF",(int)*inp)) {
+					temp[1] = *inp++;
+					temp[2] = '\0';
+				}
+				*outp++ = (char)strtol((const char *)temp,NULL,16);
+			} else {
+				*outp++ = *inp++;	/* "\x" -> literal x */
+			}
+		} else {
+			*outp++ = *inp++;	/* plain byte, or a lone trailing backslash */
+		}
+		length++;
+	}
+	*outp = '\0';	/* outp never passes inp, so this stays inside the input */
+	*sizep = length;
+	return instring;
+}
+/* ntdbtool [dbname [command [arg1 [arg2]]]]: one-shot with a command, else interactive. */
+int main(int argc, char *argv[])
+{
+	cmdname = "";
+	arg1 = NULL;
+	arg1len = 0;
+	arg2 = NULL;
+	arg2len = 0;
+
+	if (argv[1]) {	/* a database name was given: open it first */
+		cmdname = "open";
+		arg1 = argv[1];
+		do_command();
+		cmdname =  "";
+		arg1 = NULL;
+	}
+
+	switch (argc) {
+	case 1:
+	case 2:
+		/* Interactive mode */
+		while ((cmdname = ntdb_getline("ntdb> "))) {
+			arg2 = arg1 = NULL;
+			if ((arg1 = strchr((const char *)cmdname,' ')) != NULL) {
+				arg1++;	/* cmdname itself stays unterminated here */
+				arg2 = arg1;
+				while (*arg2) {
+					if (*arg2 == ' ') {	/* unescaped space ends arg1 */
+						*arg2++ = '\0';
+						break;
+					}
+					if ((*arg2++ == '\\') && (*arg2 == ' ')) {
+						arg2++;	/* "\ " is part of arg1: step over it */
+					}
+				}
+			}
+			if (arg1) arg1 = convert_string(arg1,&arg1len);
+			if (arg2) arg2 = convert_string(arg2,&arg2len);
+			if (do_command()) break;	/* non-zero means quit */
+		}
+		break;
+	case 5:
+		arg2 = convert_string(argv[4],&arg2len);	/* falls through */
+	case 4:
+		arg1 = convert_string(argv[3],&arg1len);	/* falls through */
+	case 3:
+		cmdname = argv[2];	/* falls through */
+	default:
+		do_command();
+		break;
+	}
+
+	if (ntdb) ntdb_close(ntdb);
+
+	return 0;
+}
diff --git a/lib/ntdb/tools/ntdbtorture.c b/lib/ntdb/tools/ntdbtorture.c
new file mode 100644
index 0000000000..c7b249db06
--- /dev/null
+++ b/lib/ntdb/tools/ntdbtorture.c
@@ -0,0 +1,529 @@
+/* this tests ntdb by doing lots of ops from several simultaneous
+   writers - that stresses the locking code.
+*/
+
+#include "config.h"
+#include "ntdb.h"
+#include <ccan/err/err.h>
+#ifdef HAVE_LIBREPLACE
+#include <replace.h>
+#else
+#include <stdlib.h>
+#include <getopt.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <time.h>
+#include <sys/wait.h>
+#endif
+
+//#define REOPEN_PROB 30
+#define DELETE_PROB 8	/* each *_PROB N: action taken when random() % N == 0 */
+#define STORE_PROB 4
+#define APPEND_PROB 6
+#define TRANSACTION_PROB 10
+#define TRANSACTION_PREPARE_PROB 2
+#define LOCKSTORE_PROB 5
+#define TRAVERSE_PROB 20
+#define TRAVERSE_MOD_PROB 100
+#define TRAVERSE_ABORT_PROB 500
+#define CULL_PROB 100
+#define KEYLEN 3	/* keys are 1..KEYLEN random letters plus a NUL */
+#define DATALEN 100	/* values are 1..DATALEN random letters plus a NUL */
+
+static struct ntdb_context *db;	/* the database under test */
+static int in_transaction;	/* non-zero while addrec_db() has a transaction open */
+static int in_traverse;	/* non-zero while inside addrec_db()'s traverse */
+static int error_count;	/* incremented by fatal() */
+#if TRANSACTION_PROB
+static int always_transaction = 0;
+#endif
+static int loopnum;	/* current iteration of run_child()'s main loop */
+static int count_pipe;	/* fd that send_count_and_suicide() writes loopnum to */
+static union ntdb_attribute log_attr;
+static union ntdb_attribute seed_attr;
+/* Log callback: print "ntdb:<name>:<error>:<message>" on stdout and flush. */
+static void ntdb_log(struct ntdb_context *ntdb,
+		    enum ntdb_log_level level,
+		    enum NTDB_ERROR ecode,
+		    const char *message,
+		    void *data)
+{
+	printf("ntdb:%s:%s:%s\n",
+	       ntdb_name(ntdb), ntdb_errorstr(ecode), message);
+	fflush(stdout);
+#if 0	/* disabled debugging aid: attach gdb in an xterm to this process */
+	{
+		char str[200];
+		signal(SIGUSR1, SIG_IGN);
+		sprintf(str,"xterm -e gdb /proc/%d/exe %d", getpid(), getpid());
+		system(str);
+	}
+#endif
+}
+
+#include "../private.h"
+/* Handler run_child() installs for signal 11: report the fault, linger, _exit(11). */
+static void segv_handler(int sig, siginfo_t *info, void *p)
+{
+	char string[100];
+	/* snprintf: this format can expand to more than sizeof(string) bytes */
+	snprintf(string, sizeof(string),
+		 "%u: death at %p (map_ptr %p, map_size %zu)\n", getpid(),
+		 info->si_addr, db->file->map_ptr, (size_t)db->file->map_size);
+	if (write(2, string, strlen(string)) > 0)
+		sleep(60);
+	_exit(11);
+}
+/* Report a failure and count it; despite the name this does not exit. */
+static void fatal(struct ntdb_context *ntdb, const char *why)
+{
+	const char *detail = ntdb ? ntdb_errorstr(ntdb_error(ntdb)) : "(no ntdb)";
+	fprintf(stderr, "%u:%s:%s\n", getpid(), why, detail);
+	error_count++;
+}
+/* Return a malloc'd string of len random lowercase letters; caller frees it. */
+static char *randbuf(int len)
+{
+	char *buf = (char *)malloc(len+1);
+	int n = 0;
+
+	/* rand() picks each letter; the NUL goes right after the last one */
+	while (n < len)
+		buf[n++] = 'a' + (rand() % 26);
+	buf[n] = 0;
+
+	return buf;
+}
+/* Traverse callback that randomly deletes the key, runs addrec_db(), or returns 1. */
+static void addrec_db(void);
+static int modify_traverse(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA dbuf,
+			   void *state)
+{
+#if CULL_PROB
+	if (random() % CULL_PROB == 0) {	/* delete the record being visited */
+		ntdb_delete(ntdb, key);
+	}
+#endif
+
+#if TRAVERSE_MOD_PROB
+	if (random() % TRAVERSE_MOD_PROB == 0) {	/* one more random op, mid-traverse */
+		addrec_db();
+	}
+#endif
+
+#if TRAVERSE_ABORT_PROB
+	if (random() % TRAVERSE_ABORT_PROB == 0)
+		return 1;	/* non-zero return; presumably ends the traverse -- confirm */
+#endif
+
+	return 0;
+}
+/* Perform one randomly chosen operation on db using a fresh random key and value. */
+static void addrec_db(void)
+{
+	int klen, dlen;
+	char *k, *d;
+	NTDB_DATA key, data;
+
+	klen = 1 + (rand() % KEYLEN);
+	dlen = 1 + (rand() % DATALEN);
+
+	k = randbuf(klen);
+	d = randbuf(dlen);
+
+	key.dptr = (unsigned char *)k;
+	key.dsize = klen+1;	/* +1: the terminating NUL is part of the key */
+
+	data.dptr = (unsigned char *)d;
+	data.dsize = dlen+1;	/* likewise for the value */
+
+#if REOPEN_PROB
+	if (in_traverse == 0 && in_transaction == 0 && random() % REOPEN_PROB == 0) {
+		ntdb_reopen_all(0);
+		goto next;
+	}
+#endif
+
+#if TRANSACTION_PROB
+	if (in_traverse == 0 && in_transaction == 0 && (always_transaction || random() % TRANSACTION_PROB == 0)) {
+		if (ntdb_transaction_start(db) != 0) {
+			fatal(db, "ntdb_transaction_start failed");
+		}
+		in_transaction++;
+		goto next;
+	}
+	if (in_traverse == 0 && in_transaction && random() % TRANSACTION_PROB == 0) {
+		if (random() % TRANSACTION_PREPARE_PROB == 0) {	/* sometimes prepare first */
+			if (ntdb_transaction_prepare_commit(db) != 0) {
+				fatal(db, "ntdb_transaction_prepare_commit failed");
+			}
+		}
+		if (ntdb_transaction_commit(db) != 0) {
+			fatal(db, "ntdb_transaction_commit failed");
+		}
+		in_transaction--;
+		goto next;
+	}
+
+	if (in_traverse == 0 && in_transaction && random() % TRANSACTION_PROB == 0) {
+		ntdb_transaction_cancel(db);
+		in_transaction--;
+		goto next;
+	}
+#endif
+
+#if DELETE_PROB
+	if (random() % DELETE_PROB == 0) {
+		ntdb_delete(db, key);	/* result deliberately not checked */
+		goto next;
+	}
+#endif
+
+#if STORE_PROB
+	if (random() % STORE_PROB == 0) {
+		if (ntdb_store(db, key, data, NTDB_REPLACE) != 0) {
+			fatal(db, "ntdb_store failed");
+		}
+		goto next;
+	}
+#endif
+
+#if APPEND_PROB
+	if (random() % APPEND_PROB == 0) {
+		if (ntdb_append(db, key, data) != 0) {
+			fatal(db, "ntdb_append failed");
+		}
+		goto next;
+	}
+#endif
+
+#if LOCKSTORE_PROB
+	if (random() % LOCKSTORE_PROB == 0) {
+		ntdb_chainlock(db, key);
+		if (ntdb_fetch(db, key, &data) != NTDB_SUCCESS) {	/* data no longer points at d */
+			data.dsize = 0;
+			data.dptr = NULL;
+		}
+		if (ntdb_store(db, key, data, NTDB_REPLACE) != 0) {
+			fatal(db, "ntdb_store failed");
+		}
+		if (data.dptr) free(data.dptr);	/* the fetched copy; d is freed at next: */
+		ntdb_chainunlock(db, key);
+		goto next;
+	}
+#endif
+
+#if TRAVERSE_PROB
+	/* FIXME: recursive traverses break transactions? */
+	if (in_traverse == 0 && random() % TRAVERSE_PROB == 0) {
+		in_traverse++;
+		ntdb_traverse(db, modify_traverse, NULL);
+		in_traverse--;
+		goto next;
+	}
+#endif
+
+	if (ntdb_fetch(db, key, &data) == NTDB_SUCCESS)	/* nothing else chosen: plain fetch */
+		free(data.dptr);
+
+next:
+	free(k);
+	free(d);
+}
+/* Traverse callback used by run_child() for its final passes: delete each key. */
+static int traverse_fn(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA dbuf,
+                       void *state)
+{
+	ntdb_delete(ntdb, key);
+	return 0;
+}
+
+static void usage(void) /* print the command-line synopsis to stdout, then exit(0) */
+{
+	printf("Usage: ntdbtorture"
+#if TRANSACTION_PROB
+	       " [-t]"	/* -t is only advertised when TRANSACTION_PROB is non-zero */
+#endif
+	       " [-k] [-n NUM_PROCS] [-l NUM_LOOPS] [-s SEED] [-S]\n");
+	exit(0);
+}
+
+static void send_count_and_suicide(int sig)
+{
+	/* Tell our successor where to resume; _exit() is async-signal-safe, exit() is not. */
+	if (write(count_pipe, &loopnum, sizeof(loopnum)) != sizeof(loopnum))
+		_exit(2);
+	/* Die by SIGUSR2: a unique signature telling the parent the count was written. */
+	kill(getpid(), SIGUSR2);
+}
+
+static int run_child(const char *filename, int i, int seed, unsigned num_loops,
+		     unsigned start, int ntdb_flags)
+{
+	/* One torture worker: open the db, run loops start..num_loops-1, then delete all. */
+	struct sigaction act = { .sa_sigaction = segv_handler, .sa_flags = SA_SIGINFO };
+	sigaction(SIGSEGV, &act, NULL);	/* was a bare 11; use the portable name */
+
+	db = ntdb_open(filename, ntdb_flags, O_RDWR | O_CREAT, 0600,
+		      &log_attr);
+	if (!db) {
+		fatal(NULL, "db open failed");
+	}
+
+#if 0
+	if (i == 0) {
+		printf("pid %i\n", getpid());
+		sleep(9);
+	} else
+		sleep(10);
+#endif
+
+	srand(seed + i);	/* each child index i gets its own random stream */
+	srandom(seed + i);
+
+	/* Set global, then we're ready to handle being killed. */
+	loopnum = start;
+	signal(SIGUSR1, send_count_and_suicide);
+
+	for (;loopnum<num_loops && error_count == 0;loopnum++) {
+		addrec_db();
+	}
+
+	if (error_count == 0) {
+		ntdb_traverse(db, NULL, NULL);	/* plain walk, no callback */
+#if TRANSACTION_PROB
+		if (always_transaction) {
+			/* drop any transaction left open by the loop, then start a fresh one */
+			while (in_transaction) {
+				ntdb_transaction_cancel(db);
+				in_transaction--;
+			}
+			if (ntdb_transaction_start(db) != 0)
+				fatal(db, "ntdb_transaction_start failed");
+		}
+#endif
+		ntdb_traverse(db, traverse_fn, NULL);	/* traverse_fn deletes each record */
+		ntdb_traverse(db, traverse_fn, NULL);
+#if TRANSACTION_PROB
+		if (always_transaction) {
+			if (ntdb_transaction_commit(db) != 0)
+				fatal(db, "ntdb_transaction_commit failed");
+		}
+#endif
+	}
+
+	ntdb_close(db);
+
+	/* error count capped at 100; callers hand this value to exit() */
+	return (error_count < 100 ? error_count : 100);
+}
+
+static char *test_path(const char *filename)
+{
+	/*
+	 * Build a heap-allocated path for a test file: "$TEST_DATA_PREFIX/filename"
+	 * when that environment variable is set, otherwise a plain copy of
+	 * filename.  Returns NULL if the allocation fails; the caller frees
+	 * the result.
+	 */
+	char *joined = NULL;
+	const char *dir = getenv("TEST_DATA_PREFIX");
+
+	if (dir == NULL)
+		return strdup(filename);
+	if (asprintf(&joined, "%s/%s", dir, filename) == -1)
+		return NULL;
+	return joined;
+}
+
+int main(int argc, char * const *argv)
+{
+	/* Fork num_procs torture workers on one db, restart killed ones, then check it. */
+	int i, seed = -1;
+	int num_loops = 5000, num_procs = 3;
+	int c, pfds[2], *done;
+	extern char *optarg;
+	pid_t *pids;
+	int kill_random = 0, ntdb_flags = NTDB_DEFAULT;
+	char *test_ntdb;
+
+	log_attr.base.attr = NTDB_ATTRIBUTE_LOG;
+	log_attr.base.next = &seed_attr;
+	log_attr.log.fn = ntdb_log;
+	seed_attr.base.attr = NTDB_ATTRIBUTE_SEED;
+	seed_attr.base.next = NULL;
+	while ((c = getopt(argc, argv, "n:l:s:thkS")) != -1) {
+		switch (c) {
+		case 'n':
+			num_procs = strtol(optarg, NULL, 0);
+			break;
+		case 'l':
+			num_loops = strtol(optarg, NULL, 0);
+			break;
+		case 's':
+			seed = strtol(optarg, NULL, 0);
+			break;
+		case 'S':
+			ntdb_flags = NTDB_NOSYNC;
+			break;
+		case 't':
+#if TRANSACTION_PROB
+			always_transaction = 1;
+#else
+			fprintf(stderr, "Transactions not supported\n");
+			usage();
+#endif
+			break;
+		case 'k':
+			kill_random = 1;
+			break;
+		default:
+			usage();
+		}
+	}
+
+	test_ntdb = test_path("torture.ntdb");
+	if (test_ntdb == NULL)
+		err(1, "Allocating test path");
+	unlink(test_ntdb);
+
+	if (seed == -1) {
+		seed = (getpid() + time(NULL)) & 0x7FFFFFFF;
+	}
+	seed_attr.seed.seed = (((uint64_t)seed) << 32) | seed;
+
+	if (num_procs == 1 && !kill_random) {
+		/* Don't fork for this case, makes debugging easier. */
+		error_count = run_child(test_ntdb, 0, seed, num_loops, 0,
+					ntdb_flags);
+		goto done;
+	}
+
+	pids = (pid_t *)calloc(num_procs, sizeof(pid_t));
+	done = (int *)calloc(num_procs, sizeof(int));	/* per-child resume loop number */
+	if (num_procs > 0 && (pids == NULL || done == NULL))
+		err(1, "Allocating child tables");
+	if (pipe(pfds) != 0) {
+		perror("Creating pipe");
+		exit(1);
+	}
+	count_pipe = pfds[1];	/* killed children report their loop count here */
+	for (i=0;i<num_procs;i++) {
+		if ((pids[i]=fork()) == 0) {
+			close(pfds[0]);
+			if (i == 0) {
+				printf("testing with %d processes, %d loops, seed=%d%s\n",
+				       num_procs, num_loops, seed,
+#if TRANSACTION_PROB
+				       always_transaction ? " (all within transactions)" : ""
+#else
+				       ""
+#endif
+					);
+			}
+			exit(run_child(test_ntdb, i, seed, num_loops, 0,
+				       ntdb_flags));
+		}
+		if (pids[i] == -1)	/* never keep -1: kill(-1, ...) would signal everything */
+			err(1, "Forking child %d", i);
+	}
+
+	while (num_procs) {
+		int status, j;
+		pid_t pid;
+
+		if (error_count != 0) {
+			/* try and stop the test on any failure */
+			for (j=0;j<num_procs;j++) {
+				if (pids[j] != 0) {
+					kill(pids[j], SIGTERM);
+				}
+			}
+		}
+
+		pid = waitpid(-1, &status, kill_random ? WNOHANG : 0);
+		if (pid == 0) {
+			struct timespec ts;
+
+			/* Sleep for 1/10 second. */
+			ts.tv_sec = 0;
+			ts.tv_nsec = 100000000;
+			nanosleep(&ts, NULL);
+
+			/* Kill someone. */
+			kill(pids[random() % num_procs], SIGUSR1);
+			continue;
+		}
+
+		if (pid == -1) {
+			perror("failed to wait for child\n");
+			exit(1);
+		}
+
+		for (j=0;j<num_procs;j++) {
+			if (pids[j] == pid) break;
+		}
+		if (j == num_procs) {
+			printf("unknown child %d exited!?\n", (int)pid);
+			exit(1);
+		}
+		if (WIFSIGNALED(status)) {
+			if (WTERMSIG(status) == SIGUSR2
+			    || WTERMSIG(status) == SIGUSR1) {
+				/* SIGUSR2 means they wrote to pipe. */
+				if (WTERMSIG(status) == SIGUSR2) {
+					if (read(pfds[0], &done[j], sizeof(done[j]))
+					    != sizeof(done[j]))
+						err(1, "Short read from child?");
+				}
+				pids[j] = fork();
+				if (pids[j] == -1)
+					err(1, "Re-forking child %d", j);
+				if (pids[j] == 0)
+					exit(run_child(test_ntdb, j, seed,
+						       num_loops, done[j],
+						       ntdb_flags));
+				printf("Restarting child %i for %u-%u\n",
+				       j, done[j], num_loops);
+				continue;
+			}
+			printf("child %d exited with signal %d\n",
+			       (int)pid, WTERMSIG(status));
+			error_count++;
+		} else {
+			if (WEXITSTATUS(status) != 0) {
+				printf("child %d exited with status %d\n",
+				       (int)pid, WEXITSTATUS(status));
+				error_count++;
+			}
+		}
+		memmove(&pids[j], &pids[j+1], (num_procs - j - 1)*sizeof(pids[0]));
+		memmove(&done[j], &done[j+1], (num_procs - j - 1)*sizeof(done[0]));
+		num_procs--;	/* done[] is shifted with pids[] so the indexes still match */
+	}
+
+	free(pids);
+	free(done);
+done:
+	if (error_count == 0) {
+		db = ntdb_open(test_ntdb, NTDB_DEFAULT, O_RDWR | O_CREAT,
+			      0600, &log_attr);
+		if (!db) {
+			fatal(db, "db open failed");
+			exit(1);
+		}
+		if (ntdb_check(db, NULL, NULL) != 0) {
+			fatal(db, "db check failed");
+			exit(1);
+		}
+		ntdb_close(db);
+		printf("OK\n");
+	}
+
+	free(test_ntdb);
+	return error_count;
+}
diff --git a/lib/ntdb/tools/speed.c b/lib/ntdb/tools/speed.c
new file mode 100644
index 0000000000..868494b898
--- /dev/null
+++ b/lib/ntdb/tools/speed.c
@@ -0,0 +1,443 @@
+/* Simple speed test for NTDB */
+#include <ccan/err/err.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include "ntdb.h"
+
+/* Average nanoseconds per operation for num operations timed from start to stop. */
+static size_t normalize(const struct timeval *start,
+			const struct timeval *stop,
+			unsigned int num)
+{
+	struct timeval diff;
+
+	timersub(stop, start, &diff);
+
+	/* Widen to double before scaling (no long overflow); treat num == 0 as 1. */
+	return ((double)diff.tv_sec * 1000000 + diff.tv_usec)
+		/ (num ? num : 1) * 1000;
+}
+
+static size_t file_size(void)
+{
+	/* Size in bytes of /tmp/speed.ntdb, or (size_t)-1 when stat() fails. */
+	struct stat info;
+	int rc = stat("/tmp/speed.ntdb", &info);
+
+	return rc == 0 ? info.st_size : (size_t)-1;
+}
+
+static int count_record(struct ntdb_context *ntdb,
+			NTDB_DATA key, NTDB_DATA data, void *p)
+{
+	/* Traverse callback: add the int at the start of the record's data to *p. */
+	*(int *)p += *(int *)data.dptr;
+	return 0;
+}
+
+static void dump_and_clear_stats(struct ntdb_context **ntdb,
+				 int flags,
+				 union ntdb_attribute *attr)
+{
+	/* Print every statistics counter of *ntdb, then close and reopen it into *ntdb. */
+	union ntdb_attribute stats;
+	enum NTDB_ERROR ecode;
+
+	stats.base.attr = NTDB_ATTRIBUTE_STATS;
+	stats.stats.size = sizeof(stats.stats);
+	ecode = ntdb_get_attribute(*ntdb, &stats);
+	if (ecode != NTDB_SUCCESS)
+		errx(1, "Getting stats: %s", ntdb_errorstr(ecode));
+
+	printf("allocs = %llu\n",
+	       (unsigned long long)stats.stats.allocs);
+	printf("  alloc_subhash = %llu\n",
+	       (unsigned long long)stats.stats.alloc_subhash);
+	printf("  alloc_chain = %llu\n",
+	       (unsigned long long)stats.stats.alloc_chain);
+	printf("  alloc_bucket_exact = %llu\n",
+	       (unsigned long long)stats.stats.alloc_bucket_exact);
+	printf("  alloc_bucket_max = %llu\n",
+	       (unsigned long long)stats.stats.alloc_bucket_max);
+	printf("  alloc_leftover = %llu\n",
+	       (unsigned long long)stats.stats.alloc_leftover);
+	printf("  alloc_coalesce_tried = %llu\n",
+	       (unsigned long long)stats.stats.alloc_coalesce_tried);
+	printf("    alloc_coalesce_iterate_clash = %llu\n",
+	       (unsigned long long)stats.stats.alloc_coalesce_iterate_clash);
+	printf("    alloc_coalesce_lockfail = %llu\n",
+	       (unsigned long long)stats.stats.alloc_coalesce_lockfail);
+	printf("    alloc_coalesce_race = %llu\n",
+	       (unsigned long long)stats.stats.alloc_coalesce_race);
+	printf("    alloc_coalesce_succeeded = %llu\n",
+	       (unsigned long long)stats.stats.alloc_coalesce_succeeded);
+	printf("      alloc_coalesce_num_merged = %llu\n",
+	       (unsigned long long)stats.stats.alloc_coalesce_num_merged);
+	printf("compares = %llu\n",
+	       (unsigned long long)stats.stats.compares);
+	printf("  compare_wrong_bucket = %llu\n",
+	       (unsigned long long)stats.stats.compare_wrong_bucket);
+	printf("  compare_wrong_offsetbits = %llu\n",
+	       (unsigned long long)stats.stats.compare_wrong_offsetbits);
+	printf("  compare_wrong_keylen = %llu\n",
+	       (unsigned long long)stats.stats.compare_wrong_keylen);
+	printf("  compare_wrong_rechash = %llu\n",
+	       (unsigned long long)stats.stats.compare_wrong_rechash);
+	printf("  compare_wrong_keycmp = %llu\n",
+	       (unsigned long long)stats.stats.compare_wrong_keycmp);
+	printf("transactions = %llu\n",
+	       (unsigned long long)stats.stats.transactions);
+	printf("  transaction_cancel = %llu\n",
+	       (unsigned long long)stats.stats.transaction_cancel);
+	printf("  transaction_nest = %llu\n",
+	       (unsigned long long)stats.stats.transaction_nest);
+	printf("  transaction_expand_file = %llu\n",
+	       (unsigned long long)stats.stats.transaction_expand_file);
+	printf("  transaction_read_direct = %llu\n",
+	       (unsigned long long)stats.stats.transaction_read_direct);
+	printf("    transaction_read_direct_fail = %llu\n",
+	       (unsigned long long)stats.stats.transaction_read_direct_fail);
+	printf("  transaction_write_direct = %llu\n",
+	       (unsigned long long)stats.stats.transaction_write_direct);
+	printf("    transaction_write_direct_fail = %llu\n",
+	       (unsigned long long)stats.stats.transaction_write_direct_fail);
+	printf("expands = %llu\n", (unsigned long long)stats.stats.expands);
+	printf("frees = %llu\n", (unsigned long long)stats.stats.frees);
+	printf("locks = %llu\n", (unsigned long long)stats.stats.locks);
+	printf("  lock_lowlevel = %llu\n",
+	       (unsigned long long)stats.stats.lock_lowlevel);
+	printf("  lock_nonblock = %llu\n",
+	       (unsigned long long)stats.stats.lock_nonblock);
+	printf("    lock_nonblock_fail = %llu\n",
+	       (unsigned long long)stats.stats.lock_nonblock_fail);
+
+	/* Now clear, by closing and reopening; callers use *ntdb next, so fail loudly. */
+	ntdb_close(*ntdb);
+	*ntdb = ntdb_open("/tmp/speed.ntdb", flags, O_RDWR, 0, attr);
+	if (!*ntdb)
+		err(1, "Reopening /tmp/speed.ntdb");
+}
+
+static void ntdb_log(struct ntdb_context *ntdb, enum ntdb_log_level level,
+		    enum NTDB_ERROR ecode, const char *message, void *data)
+{
+	/* Log hook: report "ntdb:<db name>:<error string>:<message>" on stderr. */
+	const char *name = ntdb_name(ntdb);
+	const char *why = ntdb_errorstr(ecode);
+
+	fprintf(stderr, "ntdb:%s:%s:%s\n", name, why, message);
+}
+
+int main(int argc, char *argv[])
+{
+	/* Benchmark: time add/find/miss/traverse/delete/re-add/append/churn of num records. */
+	unsigned int i, j, num = 1000, stage = 0, stopat = -1;
+	int flags = NTDB_DEFAULT;
+	bool transaction = false, summary = false, do_stats = false;
+	NTDB_DATA key, data;
+	struct ntdb_context *ntdb;
+	struct timeval start, stop;
+	union ntdb_attribute seed, log;
+	enum NTDB_ERROR ecode;
+
+	/* Try to keep benchmarks even. */
+	seed.base.attr = NTDB_ATTRIBUTE_SEED;
+	seed.base.next = NULL;
+	seed.seed.seed = 0;
+
+	log.base.attr = NTDB_ATTRIBUTE_LOG;
+	log.base.next = &seed;
+	log.log.fn = ntdb_log;
+
+	/* Options are only recognised in exactly this order. */
+	if (argv[1] && strcmp(argv[1], "--internal") == 0) {
+		flags = NTDB_INTERNAL;
+		argc--;
+		argv++;
+	}
+	if (argv[1] && strcmp(argv[1], "--transaction") == 0) {
+		transaction = true;
+		argc--;
+		argv++;
+	}
+	if (argv[1] && strcmp(argv[1], "--no-sync") == 0) {
+		flags |= NTDB_NOSYNC;
+		argc--;
+		argv++;
+	}
+	if (argv[1] && strcmp(argv[1], "--summary") == 0) {
+		summary = true;
+		argc--;
+		argv++;
+	}
+	if (argv[1] && strcmp(argv[1], "--stats") == 0) {
+		do_stats = true;
+		argc--;
+		argv++;
+	}
+
+	ntdb = ntdb_open("/tmp/speed.ntdb", flags, O_RDWR|O_CREAT|O_TRUNC,
+		       0600, &log);
+	if (!ntdb)
+		err(1, "Opening /tmp/speed.ntdb");
+
+	/* Key and data both alias i, so changing i selects the record. */
+	key.dptr = (void *)&i;
+	key.dsize = sizeof(i);
+	data = key;
+	if (argv[1]) {
+		num = atoi(argv[1]);
+		argv++;
+		argc--;
+	}
+
+	if (argv[1]) {
+		stopat = atoi(argv[1]);
+		argv++;
+		argc--;
+	}
+
+	/* Add num records. */
+	printf("Adding %u records: ", num); fflush(stdout);
+	if (transaction && (ecode = ntdb_transaction_start(ntdb)))
+		errx(1, "starting transaction: %s", ntdb_errorstr(ecode));
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i++)
+		if ((ecode = ntdb_store(ntdb, key, data, NTDB_INSERT)) != 0)
+			errx(1, "Inserting key %u in ntdb: %s",
+			     i, ntdb_errorstr(ecode));
+	gettimeofday(&stop, NULL);
+	if (transaction && (ecode = ntdb_transaction_commit(ntdb)))
+		errx(1, "committing transaction: %s", ntdb_errorstr(ecode));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+
+	if (ntdb_check(ntdb, NULL, NULL))
+		errx(1, "ntdb_check failed!");
+	if (summary) {
+		char *sumstr = NULL;
+		ntdb_summary(ntdb, NTDB_SUMMARY_HISTOGRAMS, &sumstr);
+		printf("%s\n", sumstr);
+		free(sumstr);
+	}
+	if (do_stats)
+		dump_and_clear_stats(&ntdb, flags, &log);
+
+	if (++stage == stopat)
+		exit(0);
+
+	/* Find num records. */
+	printf("Finding %u records: ", num); fflush(stdout);
+	if (transaction && (ecode = ntdb_transaction_start(ntdb)))
+		errx(1, "starting transaction: %s", ntdb_errorstr(ecode));
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i++) {
+		NTDB_DATA dbuf;
+		if ((ecode = ntdb_fetch(ntdb, key, &dbuf)) != NTDB_SUCCESS
+		    || *(int *)dbuf.dptr != i)
+			errx(1, "Fetching key %u in ntdb gave %u",
+			     i, ecode ? ecode : *(int *)dbuf.dptr);
+		free(dbuf.dptr);	/* a successful fetch hands us an allocated copy */
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && (ecode = ntdb_transaction_commit(ntdb)))
+		errx(1, "committing transaction: %s", ntdb_errorstr(ecode));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (ntdb_check(ntdb, NULL, NULL))
+		errx(1, "ntdb_check failed!");
+	if (summary) {
+		char *sumstr = NULL;
+		ntdb_summary(ntdb, NTDB_SUMMARY_HISTOGRAMS, &sumstr);
+		printf("%s\n", sumstr);
+		free(sumstr);
+	}
+	if (do_stats)
+		dump_and_clear_stats(&ntdb, flags, &log);
+	if (++stage == stopat)
+		exit(0);
+
+	/* Look up num records that do not exist. */
+	printf("Missing %u records: ", num); fflush(stdout);
+	if (transaction && (ecode = ntdb_transaction_start(ntdb)))
+		errx(1, "starting transaction: %s", ntdb_errorstr(ecode));
+	gettimeofday(&start, NULL);
+	for (i = num; i < num*2; i++) {
+		NTDB_DATA dbuf;
+		ecode = ntdb_fetch(ntdb, key, &dbuf);
+		if (ecode != NTDB_ERR_NOEXIST)
+			errx(1, "Fetching key %u in ntdb gave %s",
+			     i, ntdb_errorstr(ecode));
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && (ecode = ntdb_transaction_commit(ntdb)))
+		errx(1, "committing transaction: %s", ntdb_errorstr(ecode));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (ntdb_check(ntdb, NULL, NULL))
+		errx(1, "ntdb_check failed!");
+	if (summary) {
+		char *sumstr = NULL;
+		ntdb_summary(ntdb, NTDB_SUMMARY_HISTOGRAMS, &sumstr);
+		printf("%s\n", sumstr);
+		free(sumstr);
+	}
+	if (do_stats)
+		dump_and_clear_stats(&ntdb, flags, &log);
+	if (++stage == stopat)
+		exit(0);
+
+	/* Traverse num records; count_record sums the stored values into i. */
+	printf("Traversing %u records: ", num); fflush(stdout);
+	if (transaction && (ecode = ntdb_transaction_start(ntdb)))
+		errx(1, "starting transaction: %s", ntdb_errorstr(ecode));
+	i = 0;
+	gettimeofday(&start, NULL);
+	if (ntdb_traverse(ntdb, count_record, &i) != num)
+		errx(1, "Traverse returned wrong number of records");
+	if (i != (unsigned int)((unsigned long long)num * (num - 1) / 2))
+		errx(1, "Traverse tallied to %u", i);
+	gettimeofday(&stop, NULL);
+	if (transaction && (ecode = ntdb_transaction_commit(ntdb)))
+		errx(1, "committing transaction: %s", ntdb_errorstr(ecode));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (ntdb_check(ntdb, NULL, NULL))
+		errx(1, "ntdb_check failed!");
+	if (summary) {
+		char *sumstr = NULL;
+		ntdb_summary(ntdb, NTDB_SUMMARY_HISTOGRAMS, &sumstr);
+		printf("%s\n", sumstr);
+		free(sumstr);
+	}
+	if (do_stats)
+		dump_and_clear_stats(&ntdb, flags, &log);
+	if (++stage == stopat)
+		exit(0);
+
+	/* Delete num records (not in order). */
+	printf("Deleting %u records: ", num); fflush(stdout);
+	if (transaction && (ecode = ntdb_transaction_start(ntdb)))
+		errx(1, "starting transaction: %s", ntdb_errorstr(ecode));
+	gettimeofday(&start, NULL);
+	for (j = 0; j < num; j++) {
+		i = (j + 100003) % num;
+		if ((ecode = ntdb_delete(ntdb, key)) != NTDB_SUCCESS)
+			errx(1, "Deleting key %u in ntdb: %s",
+			     i, ntdb_errorstr(ecode));
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && (ecode = ntdb_transaction_commit(ntdb)))
+		errx(1, "committing transaction: %s", ntdb_errorstr(ecode));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (ntdb_check(ntdb, NULL, NULL))
+		errx(1, "ntdb_check failed!");
+	if (summary) {
+		char *sumstr = NULL;
+		ntdb_summary(ntdb, NTDB_SUMMARY_HISTOGRAMS, &sumstr);
+		printf("%s\n", sumstr);
+		free(sumstr);
+	}
+	if (do_stats)
+		dump_and_clear_stats(&ntdb, flags, &log);
+	if (++stage == stopat)
+		exit(0);
+
+	/* Re-add num records (not in order). */
+	printf("Re-adding %u records: ", num); fflush(stdout);
+	if (transaction && (ecode = ntdb_transaction_start(ntdb)))
+		errx(1, "starting transaction: %s", ntdb_errorstr(ecode));
+	gettimeofday(&start, NULL);
+	for (j = 0; j < num; j++) {
+		i = (j + 100003) % num;
+		if ((ecode = ntdb_store(ntdb, key, data, NTDB_INSERT)) != 0)
+			errx(1, "Inserting key %u in ntdb: %s",
+			     i, ntdb_errorstr(ecode));
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && (ecode = ntdb_transaction_commit(ntdb)))
+		errx(1, "committing transaction: %s", ntdb_errorstr(ecode));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (ntdb_check(ntdb, NULL, NULL))
+		errx(1, "ntdb_check failed!");
+	if (summary) {
+		char *sumstr = NULL;
+		ntdb_summary(ntdb, NTDB_SUMMARY_HISTOGRAMS, &sumstr);
+		printf("%s\n", sumstr);
+		free(sumstr);
+	}
+	if (do_stats)
+		dump_and_clear_stats(&ntdb, flags, &log);
+	if (++stage == stopat)
+		exit(0);
+
+	/* Append to num records. */
+	if (transaction && (ecode = ntdb_transaction_start(ntdb)))
+		errx(1, "starting transaction: %s", ntdb_errorstr(ecode));
+	printf("Appending %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i++)
+		if ((ecode = ntdb_append(ntdb, key, data)) != NTDB_SUCCESS)
+			errx(1, "Appending key %u in ntdb: %s",
+			     i, ntdb_errorstr(ecode));
+	gettimeofday(&stop, NULL);
+	if (transaction && (ecode = ntdb_transaction_commit(ntdb)))
+		errx(1, "committing transaction: %s", ntdb_errorstr(ecode));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (ntdb_check(ntdb, NULL, NULL))
+		errx(1, "ntdb_check failed!");
+	if (summary) {
+		char *sumstr = NULL;
+		ntdb_summary(ntdb, NTDB_SUMMARY_HISTOGRAMS, &sumstr);
+		printf("%s\n", sumstr);
+		free(sumstr);
+	}
+	if (++stage == stopat)
+		exit(0);
+
+	/* Churn num records (delete key i, insert key i + num): not in order! */
+	if (transaction && (ecode = ntdb_transaction_start(ntdb)))
+		errx(1, "starting transaction: %s", ntdb_errorstr(ecode));
+	printf("Churning %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (j = 0; j < num; j++) {
+		i = (j + 1000019) % num;
+		if ((ecode = ntdb_delete(ntdb, key)) != NTDB_SUCCESS)
+			errx(1, "Deleting key %u in ntdb: %s",
+			     i, ntdb_errorstr(ecode));
+		i += num;
+		if ((ecode = ntdb_store(ntdb, key, data, NTDB_INSERT)) != 0)
+			errx(1, "Inserting key %u in ntdb: %s",
+			     i, ntdb_errorstr(ecode));
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && (ecode = ntdb_transaction_commit(ntdb)))
+		errx(1, "committing transaction: %s", ntdb_errorstr(ecode));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+
+	if (ntdb_check(ntdb, NULL, NULL))
+		errx(1, "ntdb_check failed!");
+	if (summary) {
+		char *sumstr = NULL;
+		ntdb_summary(ntdb, NTDB_SUMMARY_HISTOGRAMS, &sumstr);
+		printf("%s\n", sumstr);
+		free(sumstr);
+	}
+	if (do_stats)
+		dump_and_clear_stats(&ntdb, flags, &log);
+	if (++stage == stopat)
+		exit(0);
+
+	return 0;
+}
diff --git a/lib/ntdb/transaction.c b/lib/ntdb/transaction.c
new file mode 100644
index 0000000000..76408c3022
--- /dev/null
+++ b/lib/ntdb/transaction.c
@@ -0,0 +1,1322 @@
+ /*
+   Unix SMB/CIFS implementation.
+
+   trivial database library
+
+   Copyright (C) Andrew Tridgell              2005
+   Copyright (C) Rusty Russell                2010
+
+     ** NOTE! The following LGPL license applies to the ntdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "private.h"
+#define SAFE_FREE(x) do { if ((x) != NULL) {free((void *)x); (x)=NULL;} } while(0)
+
+/*
+  transaction design:
+
+  - only allow a single transaction at a time per database. This makes
+    using the transaction API simpler, as otherwise the caller would
+    have to cope with temporary failures in transactions that conflict
+    with other current transactions
+
+  - keep the transaction recovery information in the same file as the
+    database, using a special 'transaction recovery' record pointed at
+    by the header. This removes the need for extra journal files as
+    used by some other databases
+
+  - dynamically allocate the transaction recovery record, re-using it
+    for subsequent transactions. If a larger record is needed then
+    ntdb_free() the old record to place it on the normal ntdb freelist
+    before allocating the new record
+
+  - during transactions, keep a linked list of all writes that have
+    been performed by intercepting all ntdb_write() calls. The hooked
+    transaction versions of ntdb_read() and ntdb_write() check this
+    linked list and try to use the elements of the list in preference
+    to the real database.
+
+  - don't allow any locks to be held when a transaction starts,
+    otherwise we can end up with deadlock (plus lack of lock nesting
+    in POSIX locks would mean the lock is lost)
+
+  - if the caller gains a lock during the transaction but doesn't
+    release it then fail the commit
+
+  - allow for nested calls to ntdb_transaction_start(), re-using the
+    existing transaction record. If the inner transaction is canceled
+    then a subsequent commit will fail
+
+  - keep a mirrored copy of the ntdb hash chain heads to allow for the
+    fast hash heads scan on traverse, updating the mirrored copy in
+    the transaction version of ntdb_write
+
+  - allow callers to mix transaction and non-transaction use of ntdb,
+    although once a transaction is started then an exclusive lock is
+    gained until the transaction is committed or canceled
+
+  - the commit strategy involves first saving away all modified data
+    into a linearised buffer in the transaction recovery area, then
+    marking the transaction recovery area with a magic value to
+    indicate a valid recovery record. In total 4 fsync/msync calls are
+    needed per commit to prevent race conditions. It might be possible
+    to reduce this to 3 or even 2 with some more work.
+
+  - check for a valid recovery record on open of the ntdb, while the
+    open lock is held. Automatically recover from the transaction
+    recovery area if needed, then continue with the open as
+    usual. This allows for smooth crash recovery with no administrator
+    intervention.
+
+  - if NTDB_NOSYNC is passed to flags in ntdb_open then transactions are
+    still available, but no transaction recovery area is used and no
+    fsync/msync calls are made.
+*/
+
+/*
+  hold the context of any current transaction
+*/
+struct ntdb_transaction {
+	/* the original io methods - used to do IOs to the real db */
+	const struct ntdb_methods *io_methods;
+
+	/* the list of transaction blocks. When a block is first
+	   written to, it gets created in this list */
+	uint8_t **blocks;	/* indexed by offset / PAGESIZE; NULL until that block is written */
+	size_t num_blocks;	/* number of entries in blocks[] */
+	size_t last_block_size; /* number of valid bytes in the last block */
+
+	/* non-zero when an internal transaction error has
+	   occurred. All write operations will then fail until the
+	   transaction is ended */
+	int transaction_error;
+
+	/* when inside a transaction we need to keep track of any
+	   nested ntdb_transaction_start() calls, as these are allowed,
+	   but don't create a new transaction */
+	unsigned int nesting;
+
+	/* set when a prepare has already occurred */
+	bool prepared;
+	ntdb_off_t magic_offset; /* NOTE(review): presumably where the recovery magic lives - confirm */
+
+	/* old file size before transaction; bounds how much of a block is read from the real db */
+	ntdb_len_t old_map_size;
+};
+
+/* This doesn't really need to be pagesize, but we use it for similar reasons. */
+#define PAGESIZE 65536
+
+/*
+  read while in a transaction: each PAGESIZE-sized piece is served from the
+  transaction's copy of that block if one exists, else by a real read
+*/
+static enum NTDB_ERROR transaction_read(struct ntdb_context *ntdb, ntdb_off_t off,
+				       void *buf, ntdb_len_t len)
+{
+	size_t blk;
+	enum NTDB_ERROR ecode;
+
+	/* break it down into block sized ops */
+	while (len + (off % PAGESIZE) > PAGESIZE) {
+		ntdb_len_t len2 = PAGESIZE - (off % PAGESIZE);
+		ecode = transaction_read(ntdb, off, buf, len2);
+		if (ecode != NTDB_SUCCESS) {
+			return ecode;
+		}
+		len -= len2;
+		off += len2;
+		buf = (void *)(len2 + (char *)buf);
+	}
+
+	if (len == 0) {
+		return NTDB_SUCCESS;
+	}
+
+	blk = off / PAGESIZE;	/* what is left now lies within this single block */
+
+	/* see if we have it in the block list */
+	if (ntdb->transaction->num_blocks <= blk ||
+	    ntdb->transaction->blocks[blk] == NULL) {
+		/* nope, do a real read */
+		ecode = ntdb->transaction->io_methods->tread(ntdb, off, buf, len);
+		if (ecode != NTDB_SUCCESS) {
+			goto fail;
+		}
+		return NTDB_SUCCESS;	/* was a bare 0; match the other success paths */
+	}
+
+	/* it is in the block list. Now check for the last block */
+	if (blk == ntdb->transaction->num_blocks-1) {
+		/* NOTE(review): compares len only, not (off % PAGESIZE) + len - confirm */
+		if (len > ntdb->transaction->last_block_size) {
+			ecode = NTDB_ERR_IO;
+			goto fail;
+		}
+	}
+
+	/* now copy it out of this block */
+	memcpy(buf, ntdb->transaction->blocks[blk] + (off % PAGESIZE), len);
+	return NTDB_SUCCESS;
+
+fail:	/* mark the transaction as failed, then log and return the error */
+	ntdb->transaction->transaction_error = 1;
+	return ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
+			  "transaction_read: failed at off=%zu len=%zu",
+			  (size_t)off, (size_t)len);
+}
+
+
+/*
+  write while in a transaction: data lands in in-memory PAGESIZE blocks created on first write; a NULL buf writes zeros
+*/
+static enum NTDB_ERROR transaction_write(struct ntdb_context *ntdb, ntdb_off_t off,
+					const void *buf, ntdb_len_t len)
+{
+	size_t blk;
+	enum NTDB_ERROR ecode;
+
+	/* Only a commit is allowed on a prepared transaction */
+	if (ntdb->transaction->prepared) {
+		ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_ERROR,
+				   "transaction_write: transaction already"
+				   " prepared, write not allowed");
+		goto fail;
+	}
+
+	/* break it up into block sized chunks */
+	while (len + (off % PAGESIZE) > PAGESIZE) {
+		ntdb_len_t len2 = PAGESIZE - (off % PAGESIZE);
+		ecode = transaction_write(ntdb, off, buf, len2);
+		if (ecode != NTDB_SUCCESS) {
+			return ecode;
+		}
+		len -= len2;
+		off += len2;
+		if (buf != NULL) {
+			buf = (const void *)(len2 + (const char *)buf);
+		}
+	}
+
+	if (len == 0) {
+		return NTDB_SUCCESS;
+	}
+
+	blk = off / PAGESIZE;
+	off = off % PAGESIZE;	/* from here on, off is relative to the block */
+
+	if (ntdb->transaction->num_blocks <= blk) {
+		uint8_t **new_blocks;
+		/* expand the blocks array */
+		if (ntdb->transaction->blocks == NULL) {
+			new_blocks = (uint8_t **)malloc(
+				(blk+1)*sizeof(uint8_t *));
+		} else {
+			new_blocks = (uint8_t **)realloc(
+				ntdb->transaction->blocks,
+				(blk+1)*sizeof(uint8_t *));
+		}
+		if (new_blocks == NULL) {
+			ecode = ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
+					   "transaction_write:"
+					   " failed to allocate");
+			goto fail;
+		}
+		/* the newly added slots start out empty (NULL) */
+		memset(&new_blocks[ntdb->transaction->num_blocks], 0,
+		       (1+(blk - ntdb->transaction->num_blocks))*sizeof(uint8_t *));
+		ntdb->transaction->blocks = new_blocks;
+		ntdb->transaction->num_blocks = blk+1;
+		ntdb->transaction->last_block_size = 0;
+	}
+
+	/* allocate and fill a block? */
+	if (ntdb->transaction->blocks[blk] == NULL) {
+		ntdb->transaction->blocks[blk] = (uint8_t *)calloc(PAGESIZE, 1);
+		if (ntdb->transaction->blocks[blk] == NULL) {
+			ecode = ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
+					   "transaction_write:"
+					   " failed to allocate");
+			goto fail;
+		}
+		/* block offsets are computed as ntdb_off_t so they cannot wrap a narrower size_t */
+		if (ntdb->transaction->old_map_size > (ntdb_off_t)blk * PAGESIZE) {
+			ntdb_len_t len2 = PAGESIZE;
+			if (len2 + ((ntdb_off_t)blk * PAGESIZE) > ntdb->transaction->old_map_size) {
+				len2 = ntdb->transaction->old_map_size - ((ntdb_off_t)blk * PAGESIZE);
+			}
+			ecode = ntdb->transaction->io_methods->tread(ntdb,
+					(ntdb_off_t)blk * PAGESIZE,
+					ntdb->transaction->blocks[blk],
+					len2);
+			if (ecode != NTDB_SUCCESS) {
+				ecode = ntdb_logerr(ntdb, ecode,
+						   NTDB_LOG_ERROR,
+						   "transaction_write:"
+						   " failed to"
+						   " read old block: %s",
+						   strerror(errno));
+				SAFE_FREE(ntdb->transaction->blocks[blk]);
+				goto fail;
+			}
+			if (blk == ntdb->transaction->num_blocks-1) {
+				ntdb->transaction->last_block_size = len2;
+			}
+		}
+	}
+
+	/* overwrite part of an existing block */
+	if (buf == NULL) {
+		memset(ntdb->transaction->blocks[blk] + off, 0, len);
+	} else {
+		memcpy(ntdb->transaction->blocks[blk] + off, buf, len);
+	}
+	if (blk == ntdb->transaction->num_blocks-1) {
+		if (len + off > ntdb->transaction->last_block_size) {
+			ntdb->transaction->last_block_size = len + off;
+		}
+	}
+
+	return NTDB_SUCCESS;
+
+fail:	/* the error was already logged where it was detected */
+	ntdb->transaction->transaction_error = 1;
+	return ecode;
+}
+
+
+/*
+  write while in a transaction - this variant never expands the transaction blocks, it only
+  updates blocks that already exist (others are skipped), so it cannot change the recovery size
+*/
+static void transaction_write_existing(struct ntdb_context *ntdb, ntdb_off_t off,
+				       const void *buf, ntdb_len_t len)
+{
+	size_t blk;
+
+	/* break it up into block sized chunks */
+	while (len + (off % PAGESIZE) > PAGESIZE) {
+		ntdb_len_t len2 = PAGESIZE - (off % PAGESIZE);
+		transaction_write_existing(ntdb, off, buf, len2);
+		len -= len2;
+		off += len2;
+		if (buf != NULL) {
+			buf = (const void *)(len2 + (const char *)buf);
+		}
+	}
+
+	if (len == 0)
+		return;
+
+	blk = off / PAGESIZE;
+	off = off % PAGESIZE;	/* from here on, off is relative to the block */
+
+	if (ntdb->transaction->num_blocks <= blk ||
+	    ntdb->transaction->blocks[blk] == NULL)
+		return;
+
+	if (blk == ntdb->transaction->num_blocks-1 &&
+	    off + len > ntdb->transaction->last_block_size) {
+		if (off >= ntdb->transaction->last_block_size)
+			return;
+		len = ntdb->transaction->last_block_size - off;
+	}
+
+	/* overwrite part of an existing block; a NULL buf zero-fills, as in transaction_write */
+	if (buf == NULL)
+		memset(ntdb->transaction->blocks[blk] + off, 0, len);
+	else
+		memcpy(ntdb->transaction->blocks[blk] + off, buf, len);
+}
+
+
+/*
+  out of bounds check during a transaction: a probe always succeeds
+*/
+static enum NTDB_ERROR transaction_oob(struct ntdb_context *ntdb,
+				      ntdb_off_t off, ntdb_len_t len, bool probe)
+{
+	/* Otherwise off + len must not wrap around and must end inside the map. */
+	if (probe)
+		return NTDB_SUCCESS;
+	if (off + len >= off && off + len <= ntdb->file->map_size)
+		return NTDB_SUCCESS;
+	ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+		   "ntdb_oob len %lld beyond transaction size %lld",
+		   (long long)(off + len), (long long)ntdb->file->map_size);
+	return NTDB_ERR_IO;
+}
+
+/*
+  transaction version of ntdb_expand(): grows the map by "addition" bytes.
+*/
+static enum NTDB_ERROR transaction_expand_file(struct ntdb_context *ntdb,
+					      ntdb_off_t addition)
+{
+	enum NTDB_ERROR err;
+
+	/* Queue a zero-fill write (NULL buffer) over the new region, so
+	   that later reads inside the transaction see zeroed data. */
+	err = transaction_write(ntdb, ntdb->file->map_size, NULL, addition);
+	if (err != NTDB_SUCCESS)
+		return err;
+	ntdb->file->map_size += addition;
+	return NTDB_SUCCESS;
+}
+
+static void *transaction_direct(struct ntdb_context *ntdb, ntdb_off_t off,
+				size_t len, bool write_mode)
+{
+	size_t blk = off / PAGESIZE, end_blk;
+	/* blk and end_blk are the first and last transaction blocks the range touches */
+	/* This is wrong for zero-length blocks, but will fail gracefully */
+	end_blk = (off + len - 1) / PAGESIZE;
+
+	/* Can only do direct if in single block and we've already copied. */
+	if (write_mode) {
+		ntdb->stats.transaction_write_direct++;
+		if (blk != end_blk
+		    || blk >= ntdb->transaction->num_blocks
+		    || ntdb->transaction->blocks[blk] == NULL) {
+			ntdb->stats.transaction_write_direct_fail++;
+			return NULL;
+		}
+		return ntdb->transaction->blocks[blk] + off % PAGESIZE;
+	}
+	/* read mode: serve from one copied block, or pass through if no block in range is copied */
+	ntdb->stats.transaction_read_direct++;
+	/* Single which we have copied? */
+	if (blk == end_blk
+	    && blk < ntdb->transaction->num_blocks
+	    && ntdb->transaction->blocks[blk])
+		return ntdb->transaction->blocks[blk] + off % PAGESIZE;
+
+	/* Otherwise must be all not copied: a mix of copied and uncopied blocks fails with NULL. */
+	while (blk <= end_blk) {
+		if (blk >= ntdb->transaction->num_blocks)
+			break;
+		if (ntdb->transaction->blocks[blk]) {
+			ntdb->stats.transaction_read_direct_fail++;
+			return NULL;
+		}
+		blk++;
+	}
+	return ntdb->transaction->io_methods->direct(ntdb, off, len, false);
+}
+
+static const struct ntdb_methods transaction_methods = { /* installed as ntdb->io by ntdb_transaction_start() */
+	transaction_read,
+	transaction_write,
+	transaction_oob,
+	transaction_expand_file,
+	transaction_direct,
+}; /* NOTE(review): positional initializers - order must match struct ntdb_methods; confirm against its declaration */
+
+/*
+  sync to disk: fsync the whole file, then (where MS_SYNC exists) msync the given range of the mapping
+*/
+static enum NTDB_ERROR transaction_sync(struct ntdb_context *ntdb,
+				       ntdb_off_t offset, ntdb_len_t length)
+{
+	if (ntdb->flags & NTDB_NOSYNC) {
+		return NTDB_SUCCESS;
+	}
+	/* offset/length are only used for the msync below; fsync takes no range */
+	if (fsync(ntdb->file->fd) != 0) {
+		return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+				  "ntdb_transaction: fsync failed: %s",
+				  strerror(errno));
+	}
+#ifdef MS_SYNC
+	if (ntdb->file->map_ptr) { /* only when the file is currently mapped */
+		ntdb_off_t moffset = offset & ~(getpagesize()-1); /* round start down to a page boundary */
+		if (msync(moffset + (char *)ntdb->file->map_ptr,
+			  length + (offset - moffset), MS_SYNC) != 0) {
+			return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+					  "ntdb_transaction: msync failed: %s",
+					  strerror(errno));
+		}
+	}
+#endif
+	return NTDB_SUCCESS;
+}
+
+/* Abort the current transaction (or one nesting level): drop its blocks and locks, restore normal I/O. */
+static void _ntdb_transaction_cancel(struct ntdb_context *ntdb)
+{
+	int i;
+	enum NTDB_ERROR ecode;
+
+	if (ntdb->transaction == NULL) {
+		ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
+			   "ntdb_transaction_cancel: no transaction");
+		return;
+	}
+	/* a nested cancel only flags the outer transaction as failed and pops one nesting level */
+	if (ntdb->transaction->nesting != 0) {
+		ntdb->transaction->transaction_error = 1;
+		ntdb->transaction->nesting--;
+		return;
+	}
+	/* forget any expansion recorded in map_size during the transaction */
+	ntdb->file->map_size = ntdb->transaction->old_map_size;
+
+	/* free all the transaction blocks */
+	for (i=0;i<ntdb->transaction->num_blocks;i++) {
+		if (ntdb->transaction->blocks[i] != NULL) {
+			free(ntdb->transaction->blocks[i]);
+		}
+	}
+	SAFE_FREE(ntdb->transaction->blocks);
+	/* magic_offset is set by transaction_setup_recovery() just before it writes the recovery magic */
+	if (ntdb->transaction->magic_offset) {
+		const struct ntdb_methods *methods = ntdb->transaction->io_methods;
+		uint64_t invalid = NTDB_RECOVERY_INVALID_MAGIC;
+
+		/* remove the recovery marker */
+		ecode = methods->twrite(ntdb, ntdb->transaction->magic_offset,
+					&invalid, sizeof(invalid));
+		if (ecode == NTDB_SUCCESS)
+			ecode = transaction_sync(ntdb,
+						 ntdb->transaction->magic_offset,
+						 sizeof(invalid));
+		if (ecode != NTDB_SUCCESS) {
+			ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
+				   "ntdb_transaction_cancel: failed to remove"
+				   " recovery magic");
+		}
+	}
+	/* failure above is only logged: teardown continues regardless */
+	if (ntdb->file->allrecord_lock.count)
+		ntdb_allrecord_unlock(ntdb, ntdb->file->allrecord_lock.ltype);
+
+	/* restore the normal io methods */
+	ntdb->io = ntdb->transaction->io_methods;
+
+	ntdb_transaction_unlock(ntdb, F_WRLCK);
+	/* the open lock is only held if a commit was prepared */
+	if (ntdb_has_open_lock(ntdb))
+		ntdb_unlock_open(ntdb, F_WRLCK);
+
+	SAFE_FREE(ntdb->transaction);
+}
+
+/*
+  start a ntdb transaction. No token is returned, as only a single
+  transaction is allowed to be pending per ntdb_context. Returns NTDB_SUCCESS
+  or an error code; every return path also stores the result in ntdb->last_error. */
+_PUBLIC_ enum NTDB_ERROR ntdb_transaction_start(struct ntdb_context *ntdb)
+{
+	enum NTDB_ERROR ecode;
+
+	ntdb->stats.transactions++;
+	/* some sanity checks */
+	if (ntdb->flags & NTDB_INTERNAL) {
+		return ntdb->last_error = ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
+						    NTDB_LOG_USE_ERROR,
+						    "ntdb_transaction_start:"
+						    " cannot start a"
+						    " transaction on an"
+						    " internal ntdb");
+	}
+
+	if (ntdb->flags & NTDB_RDONLY) {
+		return ntdb->last_error = ntdb_logerr(ntdb, NTDB_ERR_RDONLY,
+						    NTDB_LOG_USE_ERROR,
+						    "ntdb_transaction_start:"
+						    " cannot start a"
+						    " transaction on a "
+						    " read-only ntdb");
+	}
+
+	/* cope with nested ntdb_transaction_start() calls */
+	if (ntdb->transaction != NULL) {
+		if (!(ntdb->flags & NTDB_ALLOW_NESTING)) {
+			return ntdb->last_error
+				= ntdb_logerr(ntdb, NTDB_ERR_IO,
+					     NTDB_LOG_USE_ERROR,
+					     "ntdb_transaction_start:"
+					     " already inside transaction");
+		}
+		ntdb->transaction->nesting++;
+		ntdb->stats.transaction_nest++;
+		return ntdb->last_error = NTDB_SUCCESS; /* keep last_error in step, like every other exit */
+	}
+
+	if (ntdb_has_hash_locks(ntdb)) {
+		/* the caller must not have any locks when starting a
+		   transaction as otherwise we'll be screwed by lack
+		   of nested locks in POSIX */
+		return ntdb->last_error = ntdb_logerr(ntdb, NTDB_ERR_LOCK,
+						    NTDB_LOG_USE_ERROR,
+						    "ntdb_transaction_start:"
+						    " cannot start a"
+						    " transaction with locks"
+						    " held");
+	}
+	/* zero-initialised: no blocks, nesting 0, no error, not prepared */
+	ntdb->transaction = (struct ntdb_transaction *)
+		calloc(sizeof(struct ntdb_transaction), 1);
+	if (ntdb->transaction == NULL) {
+		return ntdb->last_error = ntdb_logerr(ntdb, NTDB_ERR_OOM,
+						    NTDB_LOG_ERROR,
+						    "ntdb_transaction_start:"
+						    " cannot allocate");
+	}
+
+	/* get the transaction write lock. This is a blocking lock. As
+	   discussed with Volker, there are a number of ways we could
+	   make this async, which we will probably do in the future */
+	ecode = ntdb_transaction_lock(ntdb, F_WRLCK);
+	if (ecode != NTDB_SUCCESS) {
+		SAFE_FREE(ntdb->transaction->blocks);
+		SAFE_FREE(ntdb->transaction);
+		return ntdb->last_error = ecode;
+	}
+
+	/* get a read lock over entire file. This is upgraded to a write
+	   lock during the commit */
+	ecode = ntdb_allrecord_lock(ntdb, F_RDLCK, NTDB_LOCK_WAIT, true);
+	if (ecode != NTDB_SUCCESS) {
+		goto fail_allrecord_lock;
+	}
+
+	/* make sure we know about any file expansions already done by
+	   anyone else (probe call: its return value is deliberately unused) */
+	ntdb->io->oob(ntdb, ntdb->file->map_size, 1, true);
+	ntdb->transaction->old_map_size = ntdb->file->map_size;
+
+	/* finally hook the io methods, replacing them with
+	   transaction specific methods */
+	ntdb->transaction->io_methods = ntdb->io;
+	ntdb->io = &transaction_methods;
+	return ntdb->last_error = NTDB_SUCCESS;
+
+fail_allrecord_lock:
+	ntdb_transaction_unlock(ntdb, F_WRLCK);
+	SAFE_FREE(ntdb->transaction->blocks);
+	SAFE_FREE(ntdb->transaction);
+	return ntdb->last_error = ecode;
+}
+
+
+/*
+  cancel the current transaction: counts the call in stats, then does the real work in _ntdb_transaction_cancel()
+*/
+_PUBLIC_ void ntdb_transaction_cancel(struct ntdb_context *ntdb)
+{
+	ntdb->stats.transaction_cancel++;
+	_ntdb_transaction_cancel(ntdb);
+}
+
+/*
+  work out how much space the linearised recovery data will consume (worst case):
+  every copied block inside the old file costs one offset/length header plus its full length */
+static ntdb_len_t ntdb_recovery_size(struct ntdb_context *ntdb)
+{
+	ntdb_len_t recovery_size = 0;
+	size_t i;
+
+	/* i is size_t (as in alloc_recovery) so "i * PAGESIZE" is not evaluated in int */
+	for (i=0;i<ntdb->transaction->num_blocks;i++) {
+		if (i * PAGESIZE >= ntdb->transaction->old_map_size) {
+			break; /* blocks past the old end of file need no recovery data */
+		}
+		if (ntdb->transaction->blocks[i] == NULL) {
+			continue; /* block never copied, hence never modified */
+		}
+		recovery_size += 2*sizeof(ntdb_off_t);
+		if (i == ntdb->transaction->num_blocks-1) {
+			recovery_size += ntdb->transaction->last_block_size;
+		} else {
+			recovery_size += PAGESIZE;
+		}
+	}
+
+	return recovery_size;
+}
+
+static enum NTDB_ERROR ntdb_recovery_area(struct ntdb_context *ntdb,
+					const struct ntdb_methods *methods,
+					ntdb_off_t *recovery_offset,
+					struct ntdb_recovery_record *rec)
+{
+	enum NTDB_ERROR ecode;
+	/* Find the current recovery area: its offset goes to *recovery_offset, its header to *rec. */
+	*recovery_offset = ntdb_read_off(ntdb,
+					offsetof(struct ntdb_header, recovery));
+	if (NTDB_OFF_IS_ERR(*recovery_offset)) {
+		return NTDB_OFF_TO_ERR(*recovery_offset);
+	}
+	/* offset 0: no recovery area yet, reported as a zero-capacity one */
+	if (*recovery_offset == 0) {
+		rec->max_len = 0;
+		return NTDB_SUCCESS;
+	}
+	/* read the record header through the caller-supplied methods */
+	ecode = methods->tread(ntdb, *recovery_offset, rec, sizeof(*rec));
+	if (ecode != NTDB_SUCCESS)
+		return ecode;
+
+	ntdb_convert(ntdb, rec, sizeof(*rec));
+	/* ignore invalid recovery regions: can happen in crash */
+	if (rec->magic != NTDB_RECOVERY_MAGIC &&
+	    rec->magic != NTDB_RECOVERY_INVALID_MAGIC) {
+		*recovery_offset = 0;
+		rec->max_len = 0;
+	}
+	return NTDB_SUCCESS;
+}
+
+static unsigned int same(const unsigned char *new,
+			 const unsigned char *old,
+			 unsigned int length)
+{
+	unsigned int n = 0;
+
+	/* Count the leading bytes on which the two buffers agree. */
+	while (n < length && new[n] == old[n])
+		n++;
+
+	return n;
+}
+
+static unsigned int different(const unsigned char *new,
+			      const unsigned char *old,
+			      unsigned int length,
+			      unsigned int min_same,
+			      unsigned int *samelen)
+{
+	unsigned int pos, run = 0;
+	/* Returns the length of the leading span to record; *samelen is the
+	 * length of the matching run (at least min_same) that follows it. */
+	for (pos = 0; pos < length; pos++) {
+		if (new[pos] == old[pos]) {
+			run++;
+			continue;
+		}
+		if (run >= min_same)
+			break;
+		run = 0;
+	}
+	/* A matching run shorter than min_same at the very end is not split off. */
+	if (pos == length && run < min_same)
+		run = 0;
+	*samelen = run;
+	return pos - run;
+}
+
+/* Allocates recovery blob, without ntdb_recovery_record at head set up. *len gets the data length (header excluded). */
+static struct ntdb_recovery_record *alloc_recovery(struct ntdb_context *ntdb,
+						  ntdb_len_t *len)
+{
+	struct ntdb_recovery_record *rec;
+	size_t i;
+	enum NTDB_ERROR ecode;
+	unsigned char *p;
+	const struct ntdb_methods *old_methods = ntdb->io;
+	/* room for the header plus the worst-case amount of recovery data */
+	rec = malloc(sizeof(*rec) + ntdb_recovery_size(ntdb));
+	if (!rec) {
+		ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
+			   "transaction_setup_recovery:"
+			   " cannot allocate");
+		return NTDB_ERR_PTR(NTDB_ERR_OOM);
+	}
+
+	/* We temporarily revert to the old I/O methods, so we can use
+	 * ntdb_access_read */
+	ntdb->io = ntdb->transaction->io_methods;
+
+	/* build the recovery data into a single blob to allow us to do a single
+	   large write, which should be more efficient */
+	p = (unsigned char *)(rec + 1);
+	for (i=0;i<ntdb->transaction->num_blocks;i++) {
+		ntdb_off_t offset;
+		ntdb_len_t length;
+		unsigned int off;
+		const unsigned char *buffer;
+
+		if (ntdb->transaction->blocks[i] == NULL) {
+			continue;
+		}
+		/* file range covered by this block; only the last block may be short */
+		offset = i * PAGESIZE;
+		length = PAGESIZE;
+		if (i == ntdb->transaction->num_blocks-1) {
+			length = ntdb->transaction->last_block_size;
+		}
+		/* blocks wholly beyond the old end of file have no old contents to save */
+		if (offset >= ntdb->transaction->old_map_size) {
+			continue;
+		}
+
+		if (offset + length > ntdb->file->map_size) {
+			ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+					   "ntdb_transaction_setup_recovery:"
+					   " transaction data over new region"
+					   " boundary");
+			goto fail;
+		}
+		if (offset + length > ntdb->transaction->old_map_size) {
+			/* Short read at EOF. */
+			length = ntdb->transaction->old_map_size - offset;
+		}
+		buffer = ntdb_access_read(ntdb, offset, length, false);
+		if (NTDB_PTR_IS_ERR(buffer)) {
+			ecode = NTDB_PTR_ERR(buffer);
+			goto fail;
+		}
+
+		/* Skip over anything the same at the start. */
+		off = same(ntdb->transaction->blocks[i], buffer, length);
+		offset += off;
+		/* emit one (offset, length, old bytes) chunk per differing span */
+		while (off < length) {
+			ntdb_len_t len1;
+			unsigned int samelen;
+			/* a matching run only ends the span if it is longer than a chunk header */
+			len1 = different(ntdb->transaction->blocks[i] + off,
+					buffer + off, length - off,
+					sizeof(offset) + sizeof(len1) + 1,
+					&samelen);
+			/* chunk header (converted via ntdb_convert), then the ORIGINAL file bytes */
+			memcpy(p, &offset, sizeof(offset));
+			memcpy(p + sizeof(offset), &len1, sizeof(len1));
+			ntdb_convert(ntdb, p, sizeof(offset) + sizeof(len1));
+			p += sizeof(offset) + sizeof(len1);
+			memcpy(p, buffer + off, len1);
+			p += len1;
+			off += len1 + samelen;
+			offset += len1 + samelen;
+		}
+		ntdb_access_release(ntdb, buffer);
+	}
+	/* bytes actually used after the header; restore the transaction I/O methods */
+	*len = p - (unsigned char *)(rec + 1);
+	ntdb->io = old_methods;
+	return rec;
+
+fail:
+	free(rec);
+	ntdb->io = old_methods;
+	return NTDB_ERR_PTR(ecode);
+}
+
+static ntdb_off_t create_recovery_area(struct ntdb_context *ntdb,
+				      ntdb_len_t rec_length,
+				      struct ntdb_recovery_record *rec)
+{
+	ntdb_off_t off, recovery_off;
+	ntdb_len_t addition;
+	enum NTDB_ERROR ecode;
+	const struct ntdb_methods *methods = ntdb->transaction->io_methods;
+	/* Appends a new recovery area at the current end of the map; returns its offset or an error offset. */
+	/* round up to a multiple of page size. Overallocate, since each
+	 * such allocation forces us to expand the file. */
+	rec->max_len = ntdb_expand_adjust(ntdb->file->map_size, rec_length);
+
+	/* Round up to a page. */
+	rec->max_len = ((sizeof(*rec) + rec->max_len + PAGESIZE-1)
+			& ~(PAGESIZE-1))
+		- sizeof(*rec);
+	/* the new area starts at the current (in-transaction) end of the map */
+	off = ntdb->file->map_size;
+
+	/* Restore ->map_size before calling underlying expand_file.
+	   Also so that we don't try to expand the file again in the
+	   transaction commit, which would destroy the recovery
+	   area */
+	addition = (ntdb->file->map_size - ntdb->transaction->old_map_size) +
+		sizeof(*rec) + rec->max_len;
+	ntdb->file->map_size = ntdb->transaction->old_map_size;
+	ntdb->stats.transaction_expand_file++;
+	ecode = methods->expand_file(ntdb, addition);
+	if (ecode != NTDB_SUCCESS) {
+		ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
+			   "ntdb_recovery_allocate:"
+			   " failed to create recovery area");
+		return NTDB_ERR_TO_OFF(ecode);
+	}
+
+	/* we have to reset the old map size so that we don't try to
+	   expand the file again in the transaction commit, which
+	   would destroy the recovery area */
+	ntdb->transaction->old_map_size = ntdb->file->map_size;
+
+	/* write the recovery header offset (no sync is issued in this function); this is not racy
+	   as the magic ptr in the recovery record has not been set */
+	recovery_off = off;
+	ntdb_convert(ntdb, &recovery_off, sizeof(recovery_off));
+	ecode = methods->twrite(ntdb, offsetof(struct ntdb_header, recovery),
+				&recovery_off, sizeof(ntdb_off_t));
+	if (ecode != NTDB_SUCCESS) {
+		ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
+			   "ntdb_recovery_allocate:"
+			   " failed to write recovery head");
+		return NTDB_ERR_TO_OFF(ecode);
+	}
+	transaction_write_existing(ntdb, offsetof(struct ntdb_header, recovery),
+				   &recovery_off,
+				   sizeof(ntdb_off_t)); /* mirror the new head into any copied header block */
+	return off; /* the unconverted offset, not the converted recovery_off */
+}
+
+/*
+  setup the recovery data that will be used on a crash during commit; on success
+  the recovery record is on disk and ntdb->transaction->magic_offset is set */
+static enum NTDB_ERROR transaction_setup_recovery(struct ntdb_context *ntdb)
+{
+	ntdb_len_t recovery_size = 0, total_size;
+	ntdb_off_t recovery_off = 0;
+	ntdb_off_t old_map_size = ntdb->transaction->old_map_size;
+	struct ntdb_recovery_record *recovery;
+	const struct ntdb_methods *methods = ntdb->transaction->io_methods;
+	uint64_t magic;
+	enum NTDB_ERROR ecode;
+	/* recovery_size counts the data that follows the record header */
+	recovery = alloc_recovery(ntdb, &recovery_size);
+	if (NTDB_PTR_IS_ERR(recovery))
+		return NTDB_PTR_ERR(recovery);
+	/* reads any existing area's header (notably max_len) into *recovery */
+	ecode = ntdb_recovery_area(ntdb, methods, &recovery_off, recovery);
+	if (ecode) {
+		free(recovery);
+		return ecode;
+	}
+
+	if (recovery->max_len < recovery_size) {
+		/* Not large enough. Free up old recovery area. */
+		if (recovery_off) {
+			ntdb->stats.frees++;
+			ecode = add_free_record(ntdb, recovery_off,
+						sizeof(*recovery)
+						+ recovery->max_len,
+						NTDB_LOCK_WAIT, true);
+			free(recovery);
+			if (ecode != NTDB_SUCCESS) {
+				return ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
+						  "ntdb_recovery_allocate:"
+						  " failed to free previous"
+						  " recovery area");
+			}
+
+			/* Refresh recovery after add_free_record above. */
+			recovery = alloc_recovery(ntdb, &recovery_size);
+			if (NTDB_PTR_IS_ERR(recovery))
+				return NTDB_PTR_ERR(recovery);
+		}
+
+		recovery_off = create_recovery_area(ntdb, recovery_size,
+						    recovery);
+		if (NTDB_OFF_IS_ERR(recovery_off)) {
+			free(recovery);
+			return NTDB_OFF_TO_ERR(recovery_off);
+		}
+	}
+
+	/* Now we know size, convert rec header. */
+	recovery->magic = NTDB_RECOVERY_INVALID_MAGIC;
+	recovery->len = recovery_size;
+	recovery->eof = old_map_size;
+	ntdb_convert(ntdb, recovery, sizeof(*recovery));
+	/* write header AND data: the record is sizeof(*recovery) + recovery_size bytes */
+	total_size = sizeof(*recovery) + recovery_size;
+	ecode = methods->twrite(ntdb, recovery_off, recovery, total_size);
+	if (ecode != NTDB_SUCCESS) {
+		free(recovery);
+		return ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
+				  "ntdb_transaction_setup_recovery:"
+				  " failed to write recovery data");
+	}
+	transaction_write_existing(ntdb, recovery_off, recovery, total_size);
+
+	free(recovery);
+
+	/* as we don't have ordered writes, we have to sync the recovery
+	   data before we update the magic to indicate that the recovery
+	   data is present */
+	ecode = transaction_sync(ntdb, recovery_off, total_size);
+	if (ecode != NTDB_SUCCESS)
+		return ecode;
+
+	magic = NTDB_RECOVERY_MAGIC;
+	ntdb_convert(ntdb, &magic, sizeof(magic));
+	/* record where the magic lives first, so a later cancel can invalidate it */
+	ntdb->transaction->magic_offset
+		= recovery_off + offsetof(struct ntdb_recovery_record, magic);
+
+	ecode = methods->twrite(ntdb, ntdb->transaction->magic_offset,
+				&magic, sizeof(magic));
+	if (ecode != NTDB_SUCCESS) {
+		return ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
+				  "ntdb_transaction_setup_recovery:"
+				  " failed to write recovery magic");
+	}
+	transaction_write_existing(ntdb, ntdb->transaction->magic_offset,
+				   &magic, sizeof(magic));
+
+	/* ensure the recovery magic marker is on disk */
+	return transaction_sync(ntdb, ntdb->transaction->magic_offset,
+				sizeof(magic));
+}
+
+static enum NTDB_ERROR _ntdb_transaction_prepare_commit(struct ntdb_context *ntdb)
+{
+	const struct ntdb_methods *methods;
+	enum NTDB_ERROR ecode;
+	/* First phase of commit: take the commit-time locks, write recovery data, expand the file. */
+	if (ntdb->transaction == NULL) {
+		return ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
+				  "ntdb_transaction_prepare_commit:"
+				  " no transaction");
+	}
+	/* preparing twice, or preparing after an error, cancels the whole transaction */
+	if (ntdb->transaction->prepared) {
+		_ntdb_transaction_cancel(ntdb);
+		return ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
+				  "ntdb_transaction_prepare_commit:"
+				  " transaction already prepared");
+	}
+
+	if (ntdb->transaction->transaction_error) {
+		_ntdb_transaction_cancel(ntdb);
+		return ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_ERROR,
+				  "ntdb_transaction_prepare_commit:"
+				  " transaction error pending");
+	}
+
+	/* inside a nested transaction there is nothing to prepare */
+	if (ntdb->transaction->nesting != 0) {
+		return NTDB_SUCCESS;
+	}
+
+	/* check for a null transaction */
+	if (ntdb->transaction->blocks == NULL) {
+		return NTDB_SUCCESS;
+	}
+
+	methods = ntdb->transaction->io_methods;
+
+	/* upgrade the main transaction lock region to a write lock */
+	ecode = ntdb_allrecord_upgrade(ntdb, NTDB_HASH_LOCK_START);
+	if (ecode != NTDB_SUCCESS) {
+		return ecode;
+	}
+
+	/* get the open lock - this prevents new users attaching to the database
+	   during the commit */
+	ecode = ntdb_lock_open(ntdb, F_WRLCK, NTDB_LOCK_WAIT|NTDB_LOCK_NOCHECK);
+	if (ecode != NTDB_SUCCESS) {
+		return ecode;
+	}
+
+	/* Since we have whole db locked, we don't need the expansion lock. */
+	if (!(ntdb->flags & NTDB_NOSYNC)) {
+		/* Writes the recovery area and sets
+		 * ntdb->transaction->magic_offset. */
+		ecode = transaction_setup_recovery(ntdb);
+		if (ecode != NTDB_SUCCESS) {
+			return ecode;
+		}
+	}
+
+	ntdb->transaction->prepared = true;
+
+	/* expand the file to the new size if needed */
+	if (ntdb->file->map_size != ntdb->transaction->old_map_size) {
+		ntdb_len_t add;
+
+		add = ntdb->file->map_size - ntdb->transaction->old_map_size;
+		/* Restore original map size for ntdb_expand_file */
+		ntdb->file->map_size = ntdb->transaction->old_map_size;
+		ecode = methods->expand_file(ntdb, add);
+		if (ecode != NTDB_SUCCESS) {
+			return ecode;
+		}
+	}
+
+	/* Keep the open lock until the actual commit */
+	return NTDB_SUCCESS;
+}
+
+/*
+   prepare to commit the current transaction; the result is also stored in ntdb->last_error
+*/
+_PUBLIC_ enum NTDB_ERROR ntdb_transaction_prepare_commit(struct ntdb_context *ntdb)
+{
+	return ntdb->last_error = _ntdb_transaction_prepare_commit(ntdb);
+}
+
+/*
+  commit the current transaction: prepare it if needed, write every copied block back
+  through the underlying I/O methods, sync, then tear the transaction down */
+_PUBLIC_ enum NTDB_ERROR ntdb_transaction_commit(struct ntdb_context *ntdb)
+{
+	const struct ntdb_methods *methods;
+	size_t i;
+	enum NTDB_ERROR ecode;
+	/* i is size_t (as in alloc_recovery) so "i * PAGESIZE" below is not evaluated in int */
+	if (ntdb->transaction == NULL) {
+		return ntdb->last_error = ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
+						    NTDB_LOG_USE_ERROR,
+						    "ntdb_transaction_commit:"
+						    " no transaction");
+	}
+
+	ntdb_trace(ntdb, "ntdb_transaction_commit");
+	/* committing a nested transaction just pops one nesting level */
+	if (ntdb->transaction->nesting != 0) {
+		ntdb->transaction->nesting--;
+		return ntdb->last_error = NTDB_SUCCESS;
+	}
+
+	/* check for a null transaction */
+	if (ntdb->transaction->blocks == NULL) {
+		_ntdb_transaction_cancel(ntdb);
+		return ntdb->last_error = NTDB_SUCCESS;
+	}
+	/* a failed prepare cancels the transaction */
+	if (!ntdb->transaction->prepared) {
+		ecode = _ntdb_transaction_prepare_commit(ntdb);
+		if (ecode != NTDB_SUCCESS) {
+			_ntdb_transaction_cancel(ntdb);
+			return ntdb->last_error = ecode;
+		}
+	}
+
+	methods = ntdb->transaction->io_methods;
+
+	/* perform all the writes */
+	for (i=0;i<ntdb->transaction->num_blocks;i++) {
+		ntdb_off_t offset;
+		ntdb_len_t length;
+
+		if (ntdb->transaction->blocks[i] == NULL) {
+			continue;
+		}
+		/* only the last block may be shorter than PAGESIZE */
+		offset = i * PAGESIZE;
+		length = PAGESIZE;
+		if (i == ntdb->transaction->num_blocks-1) {
+			length = ntdb->transaction->last_block_size;
+		}
+
+		ecode = methods->twrite(ntdb, offset,
+					ntdb->transaction->blocks[i], length);
+		if (ecode != NTDB_SUCCESS) {
+			/* we've overwritten part of the data and
+			   possibly expanded the file, so we need to
+			   run the crash recovery code */
+			ntdb->io = methods;
+			ntdb_transaction_recover(ntdb);
+
+			_ntdb_transaction_cancel(ntdb);
+
+			return ntdb->last_error = ecode;
+		}
+		SAFE_FREE(ntdb->transaction->blocks[i]);
+	}
+
+	SAFE_FREE(ntdb->transaction->blocks);
+	ntdb->transaction->num_blocks = 0;
+
+	/* ensure the new data is on disk */
+	ecode = transaction_sync(ntdb, 0, ntdb->file->map_size);
+	if (ecode != NTDB_SUCCESS) {
+		return ntdb->last_error = ecode;
+	}
+
+	/*
+	  TODO: maybe write to some dummy hdr field, or write to magic
+	  offset without mmap, before the last sync, instead of the
+	  utime() call
+	*/
+
+	/* on some systems (like Linux 2.6.x) changes via mmap/msync
+	   don't change the mtime of the file, this means the file may
+	   not be backed up (as ntdb rounding to block sizes means that
+	   file size changes are quite rare too). The following forces
+	   mtime changes when a transaction completes */
+#if HAVE_UTIME
+	utime(ntdb->name, NULL);
+#endif
+
+	/* use a transaction cancel to free memory and remove the
+	   transaction locks: it "restores" map_size, too. */
+	ntdb->transaction->old_map_size = ntdb->file->map_size;
+	_ntdb_transaction_cancel(ntdb);
+
+	return ntdb->last_error = NTDB_SUCCESS;
+}
+
+
+/*
+  recover from an aborted transaction. Must be called with exclusive
+  database write access already established (including the open
+  lock to prevent new processes attaching). Returns NTDB_SUCCESS when
+  there was nothing to recover or the old contents were written back. */
+enum NTDB_ERROR ntdb_transaction_recover(struct ntdb_context *ntdb)
+{
+	ntdb_off_t recovery_head, recovery_eof;
+	unsigned char *data, *p;
+	struct ntdb_recovery_record rec;
+	enum NTDB_ERROR ecode;
+
+	/* find the recovery area */
+	recovery_head = ntdb_read_off(ntdb, offsetof(struct ntdb_header,recovery));
+	if (NTDB_OFF_IS_ERR(recovery_head)) {
+		ecode = NTDB_OFF_TO_ERR(recovery_head);
+		return ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
+				  "ntdb_transaction_recover:"
+				  " failed to read recovery head");
+	}
+
+	if (recovery_head == 0) {
+		/* we have never allocated a recovery record */
+		return NTDB_SUCCESS;
+	}
+
+	/* read the recovery record */
+	ecode = ntdb_read_convert(ntdb, recovery_head, &rec, sizeof(rec));
+	if (ecode != NTDB_SUCCESS) {
+		return ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
+				  "ntdb_transaction_recover:"
+				  " failed to read recovery record");
+	}
+
+	if (rec.magic != NTDB_RECOVERY_MAGIC) {
+		/* there is no valid recovery data */
+		return NTDB_SUCCESS;
+	}
+
+	if (ntdb->flags & NTDB_RDONLY) {
+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+				  "ntdb_transaction_recover:"
+				  " attempt to recover read only database");
+	}
+
+	recovery_eof = rec.eof;
+
+	data = (unsigned char *)malloc(rec.len);
+	if (data == NULL) {
+		return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
+				  "ntdb_transaction_recover:"
+				  " failed to allocate recovery data");
+	}
+	/* read the full recovery data, which follows the record header */
+	ecode = ntdb->io->tread(ntdb, recovery_head + sizeof(rec), data,
+				    rec.len);
+	if (ecode != NTDB_SUCCESS) {
+		free(data); /* do not leak the buffer on the error path */
+		return ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
+				  "ntdb_transaction_recover:"
+				  " failed to read recovery data");
+	}
+
+	/* recover the file data: a sequence of (offset, length, bytes) chunks */
+	p = data;
+	while (p+sizeof(ntdb_off_t)+sizeof(ntdb_len_t) < data + rec.len) {
+		ntdb_off_t ofs;
+		ntdb_len_t len;
+		ntdb_convert(ntdb, p, sizeof(ofs) + sizeof(len));
+		memcpy(&ofs, p, sizeof(ofs));
+		memcpy(&len, p + sizeof(ofs), sizeof(len));
+		p += sizeof(ofs) + sizeof(len);
+
+		ecode = ntdb->io->twrite(ntdb, ofs, p, len);
+		if (ecode != NTDB_SUCCESS) {
+			free(data);
+			return ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
+					  "ntdb_transaction_recover:"
+					  " failed to recover %zu bytes"
+					  " at offset %zu",
+					  (size_t)len, (size_t)ofs);
+		}
+		p += len;
+	}
+
+	free(data);
+
+	ecode = transaction_sync(ntdb, 0, ntdb->file->map_size);
+	if (ecode != NTDB_SUCCESS) {
+		return ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
+				  "ntdb_transaction_recover:"
+				  " failed to sync recovery");
+	}
+
+	/* if the recovery area is after the recovered eof then remove it */
+	if (recovery_eof <= recovery_head) {
+		ecode = ntdb_write_off(ntdb, offsetof(struct ntdb_header,
+						    recovery),
+				      0);
+		if (ecode != NTDB_SUCCESS) {
+			return ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
+					  "ntdb_transaction_recover:"
+					  " failed to remove recovery head");
+		}
+	}
+
+	/* remove the recovery magic */
+	ecode = ntdb_write_off(ntdb,
+			      recovery_head
+			      + offsetof(struct ntdb_recovery_record, magic),
+			      NTDB_RECOVERY_INVALID_MAGIC);
+	if (ecode != NTDB_SUCCESS) {
+		return ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
+				  "ntdb_transaction_recover:"
+				  " failed to remove recovery magic");
+	}
+
+	ecode = transaction_sync(ntdb, 0, recovery_eof);
+	if (ecode != NTDB_SUCCESS) {
+		return ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
+				  "ntdb_transaction_recover:"
+				  " failed to sync2 recovery");
+	}
+
+	ntdb_logerr(ntdb, NTDB_SUCCESS, NTDB_LOG_WARNING,
+		   "ntdb_transaction_recover: recovered %zu byte database",
+		   (size_t)recovery_eof);
+
+	/* all done */
+	return NTDB_SUCCESS;
+}
+
+ntdb_bool_err ntdb_needs_recovery(struct ntdb_context *ntdb)
+{
+	ntdb_off_t recovery_head;
+	struct ntdb_recovery_record rec;
+	enum NTDB_ERROR ecode;
+	/* Returns true if a valid recovery record is present, false if not, or an error offset. */
+	/* find the recovery area */
+	recovery_head = ntdb_read_off(ntdb, offsetof(struct ntdb_header,recovery));
+	if (NTDB_OFF_IS_ERR(recovery_head)) {
+		return recovery_head;
+	}
+
+	if (recovery_head == 0) {
+		/* we have never allocated a recovery record */
+		return false;
+	}
+
+	/* read the recovery record */
+	ecode = ntdb_read_convert(ntdb, recovery_head, &rec, sizeof(rec));
+	if (ecode != NTDB_SUCCESS) {
+		return NTDB_ERR_TO_OFF(ecode);
+	}
+	/* only the valid magic means recovery data is waiting to be replayed */
+	return (rec.magic == NTDB_RECOVERY_MAGIC);
+}
diff --git a/lib/ntdb/traverse.c b/lib/ntdb/traverse.c
new file mode 100644
index 0000000000..52bf75c684
--- /dev/null
+++ b/lib/ntdb/traverse.c
@@ -0,0 +1,99 @@
+ /*
+   Trivial Database 2: traverse function.
+   Copyright (C) Rusty Russell 2010
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <ccan/likely/likely.h>
+
+/* Calls fn (if non-NULL) on every record; a non-zero return from fn stops the walk.
+ * Returns the number of records visited, or an error encoded by NTDB_ERR_TO_OFF(). */
+_PUBLIC_ int64_t ntdb_traverse_(struct ntdb_context *ntdb,
+		      int (*fn)(struct ntdb_context *,
+				NTDB_DATA, NTDB_DATA, void *),
+		      void *p)
+{
+	enum NTDB_ERROR ecode;
+	struct traverse_info tinfo;
+	NTDB_DATA k, d;
+	int64_t count = 0;
+	int stop;
+
+	k.dptr = NULL;
+	ecode = first_in_hash(ntdb, &tinfo, &k, &d.dsize);
+	while (ecode == NTDB_SUCCESS) {
+		/* The record's data immediately follows its key in k.dptr. */
+		d.dptr = k.dptr + k.dsize;
+		count++;
+		stop = fn ? fn(ntdb, k, d, p) : 0;
+		free(k.dptr);
+		if (stop)
+			break;
+		ecode = next_in_hash(ntdb, &tinfo, &k, &d.dsize);
+	}
+
+	/* Running off the end (NOEXIST) and an early stop both count as success. */
+	if (ecode != NTDB_SUCCESS && ecode != NTDB_ERR_NOEXIST)
+		return NTDB_ERR_TO_OFF(ntdb->last_error = ecode);
+	ntdb->last_error = NTDB_SUCCESS;
+	return count;
+}
+
+_PUBLIC_ enum NTDB_ERROR ntdb_firstkey(struct ntdb_context *ntdb, NTDB_DATA *key)
+{
+	struct traverse_info tinfo;
+	/* fetch the first key into *key; NULL is passed where ntdb_traverse_ passes &d.dsize */
+	return ntdb->last_error = first_in_hash(ntdb, &tinfo, key, NULL);
+}
+
+/* We lock twice, not very efficient.  We could keep last key & tinfo cached. */
+_PUBLIC_ enum NTDB_ERROR ntdb_nextkey(struct ntdb_context *ntdb, NTDB_DATA *key)
+{
+	struct traverse_info tinfo;
+	struct hash_info h;
+	struct ntdb_used_record rec;
+	/* look the previous key up again to fill in tinfo; key->dptr is freed whether or not that works */
+	tinfo.prev = find_and_lock(ntdb, *key, F_RDLCK, &h, &rec, &tinfo);
+	free(key->dptr);
+	if (NTDB_OFF_IS_ERR(tinfo.prev)) {
+		return ntdb->last_error = NTDB_OFF_TO_ERR(tinfo.prev);
+	}
+	ntdb_unlock_hashes(ntdb, h.hlock_start, h.hlock_range, F_RDLCK);
+	/* the hash lock is dropped before advancing; *key is overwritten with the next key */
+	return ntdb->last_error = next_in_hash(ntdb, &tinfo, key, NULL);
+}
+
+static int wipe_one(struct ntdb_context *ntdb,
+		    NTDB_DATA key, NTDB_DATA data, enum NTDB_ERROR *ecode)
+{
+	/* Traverse callback: delete this record; non-zero (stop) if the delete fails. */
+	return (*ecode = ntdb_delete(ntdb, key)) != NTDB_SUCCESS;
+}
+
+_PUBLIC_ enum NTDB_ERROR ntdb_wipe_all(struct ntdb_context *ntdb)
+{
+	enum NTDB_ERROR ecode;
+	int64_t count;
+	/* delete every record, one ntdb_delete() at a time, under a write lock on all records */
+	ecode = ntdb_allrecord_lock(ntdb, F_WRLCK, NTDB_LOCK_WAIT, false);
+	if (ecode != NTDB_SUCCESS)
+		return ntdb->last_error = ecode;
+
+	/* FIXME: Be smarter. (wipe_one stores a failed delete's error in ecode and stops the walk) */
+	count = ntdb_traverse(ntdb, wipe_one, &ecode);
+	if (count < 0)
+		ecode = NTDB_OFF_TO_ERR(count);
+	ntdb_allrecord_unlock(ntdb, F_WRLCK);
+	return ntdb->last_error = ecode;
+}
diff --git a/lib/ntdb/wscript b/lib/ntdb/wscript
new file mode 100644
index 0000000000..e6feb14f12
--- /dev/null
+++ b/lib/ntdb/wscript
@@ -0,0 +1,265 @@
+#!/usr/bin/env python
+
+APPNAME = 'ntdb'
+VERSION = '0.9'  # reused below as library vnum, minversion and PACKAGE_VERSION
+
+blddir = 'bin'  # testonly() reads this back through Utils.g_module.blddir
+
+import sys, os
+
+# find the buildtools directory, climbing at most four parent levels
+srcdir = '.'
+while not os.path.exists(srcdir+'/buildtools') and len(srcdir.split('/')) < 5:
+    srcdir = '../' + srcdir
+sys.path.insert(0, srcdir + '/buildtools/wafsamba')  # must precede the imports below
+
+import wafsamba, samba_dist, Options, Logs, glob
+
+samba_dist.DIST_DIRS('lib/ntdb:. lib/replace:lib/replace lib/ccan:lib/ccan buildtools:buildtools')
+
+def set_options(opt):
+    # Register the command-line options used by configure() and testonly().
+    opt.BUILTIN_DEFAULT('replace,ccan')
+    opt.PRIVATE_EXTENSION_DEFAULT('ntdb', noextension='ntdb')
+    opt.RECURSE('lib/replace')
+    opt.add_option('--valgrind', action="store_true", dest='VALGRIND',
+                   default=False, help="use valgrind on test programs")
+    opt.add_option('--valgrind-log', action="store", dest='VALGRINDLOG',
+                   default=None, help="where to put the valgrind log")
+    # Only offered in the launch dir, which configure() treats as standalone.
+    if opt.IN_LAUNCH_DIR():
+        opt.add_option('--disable-python', action="store_true",
+                       dest='disable_python', default=False,
+                       help="disable the pyntdb module")
+
+def configure(conf):
+    conf.RECURSE('lib/replace')
+    conf.RECURSE('lib/ccan')
+
+    conf.env.NTDB_TEST_RUN_SRC=['test/run-001-encode.c',  # each is built as ntdb-<basename> by build()
+                                'test/run-001-fls.c',
+                                'test/run-01-new_database.c',
+                                'test/run-02-expand.c',
+                                'test/run-03-coalesce.c',
+                                'test/run-04-basichash.c',
+                                'test/run-05-readonly-open.c',
+                                'test/run-10-simple-store.c',
+                                'test/run-11-simple-fetch.c',
+                                'test/run-12-check.c',
+                                'test/run-15-append.c',
+                                'test/run-20-growhash.c',
+                                'test/run-25-hashoverload.c',
+                                'test/run-30-exhaust-before-expand.c',
+                                'test/run-35-convert.c',
+                                'test/run-50-multiple-freelists.c',
+                                'test/run-56-open-during-transaction.c',
+                                'test/run-57-die-during-transaction.c',
+                                'test/run-64-bit-tdb.c',
+                                'test/run-90-get-set-attributes.c',
+                                'test/run-capabilities.c',
+                                'test/run-expand-in-transaction.c',
+                                'test/run-features.c',
+                                'test/run-lockall.c',
+                                'test/run-remap-in-read_traverse.c',
+                                'test/run-seed.c',
+                                'test/run-tdb_errorstr.c',
+                                'test/run-tdb_foreach.c',
+                                'test/run-traverse.c']
+    conf.env.NTDB_TEST_API_SRC=['test/api-12-store.c',  # likewise built as ntdb-<basename>, with the api helpers
+                                'test/api-13-delete.c',
+                                'test/api-14-exists.c',
+                                'test/api-16-wipe_all.c',
+                                'test/api-21-parse_record.c',
+                                'test/api-55-transaction.c',
+                                'test/api-80-tdb_fd.c',
+                                'test/api-81-seqnum.c',
+                                'test/api-82-lockattr.c',
+                                'test/api-83-openhook.c',
+                                'test/api-91-get-stats.c',
+                                'test/api-92-get-set-readonly.c',
+                                'test/api-93-repack.c',
+                                'test/api-add-remove-flags.c',
+                                'test/api-check-callback.c',
+                                'test/api-firstkey-nextkey.c',
+                                'test/api-fork-test.c',
+                                'test/api-locktimeout.c',
+                                'test/api-missing-entries.c',
+                                'test/api-open-multiple-times.c',
+                                'test/api-record-expand.c',
+                                'test/api-simple-delete.c',
+                                'test/api-summary.c']
+    conf.env.NTDB_TEST_API_HELPER_SRC=['test/helpapi-external-agent.c']  # -> ntdb-api-helpers subsystem
+    conf.env.NTDB_TEST_RUN_HELPER_SRC=['test/helprun-external-agent.c',  # -> ntdb-run-helpers subsystem
+                                       'test/helprun-layout.c']
+    conf.env.NTDB_TEST_HELPER_SRC=['test/external-agent.c',  # -> ntdb-test-helpers subsystem
+                                   'test/failtest_helper.c',
+                                   'test/lock-tracking.c',
+                                   'test/logging.c',
+                                   'test/tap-interface.c']
+
+    conf.env.standalone_ntdb = conf.IN_LAUNCH_DIR()  # read back by build() and testonly()
+    conf.env.disable_python = getattr(Options.options, 'disable_python', False)  # option only exists in the launch dir
+
+    if not conf.env.standalone_ntdb:
+        if conf.CHECK_BUNDLED_SYSTEM('ntdb', minversion=VERSION,
+                                         implied_deps='replace'):
+            conf.define('USING_SYSTEM_NTDB', 1)
+            if conf.CHECK_BUNDLED_SYSTEM_PYTHON('pyntdb', 'ntdb', minversion=VERSION):
+                conf.define('USING_SYSTEM_PYNTDB', 1)
+
+    if not conf.env.disable_python:
+        # also disable pyntdb if the python development headers are not installed
+        conf.find_program('python', var='PYTHON')
+        conf.check_tool('python')
+        conf.check_python_version((2,4,2))
+        conf.SAMBA_CHECK_PYTHON_HEADERS(mandatory=False)  # non-fatal: falls through to the warning below
+        if not conf.env.HAVE_PYTHON_H:
+            Logs.warn('Disabling pyntdb as python devel libs not found')
+            conf.env.disable_python = True
+
+    # This makes #include <ccan/...> work.
+    conf.ADD_EXTRA_INCLUDES('''#lib''')
+
+    conf.SAMBA_CONFIG_H()
+
+def build(bld):
+    bld.RECURSE('lib/replace')
+    bld.RECURSE('lib/ccan')
+
+    if bld.env.standalone_ntdb:
+        bld.env.PKGCONFIGDIR = '${LIBDIR}/pkgconfig'
+        private_library = False  # standalone: public library, ntdb.h gets installed
+    else:
+        private_library = True  # built inside a larger tree: keep it private
+
+    SRC = '''check.c free.c hash.c io.c lock.c open.c
+                 summary.c ntdb.c transaction.c traverse.c'''  # shared by 'ntdb' and 'ntdb-testing'
+
+    if not bld.CONFIG_SET('USING_SYSTEM_NTDB'):
+        bld.SAMBA_LIBRARY('ntdb',
+                          SRC,
+                          deps='replace ccan',
+                          includes='.',
+                          abi_directory='ABI',
+                          abi_match='ntdb_*',
+                          hide_symbols=True,
+                          vnum=VERSION,
+                          public_headers='ntdb.h',
+                          public_headers_install=not private_library,
+                          pc_files='ntdb.pc',
+                          private_library=private_library)
+
+        bld.SAMBA_BINARY('ntdbtorture',
+                         'tools/ntdbtorture.c',
+                         deps='ntdb',
+                         install=False)  # the only tool built with install=False
+
+        bld.SAMBA_BINARY('ntdbtool',
+                         'tools/ntdbtool.c',
+                         deps='ntdb')
+
+        bld.SAMBA_BINARY('ntdbdump',
+                         'tools/ntdbdump.c',
+                         deps='ntdb')
+
+        bld.SAMBA_BINARY('ntdbrestore',
+                         'tools/ntdbrestore.c',
+                         deps='ntdb')
+
+        bld.SAMBA_BINARY('ntdbbackup',
+                         'tools/ntdbbackup.c',
+                         deps='ntdb')
+
+        if bld.env.DEVELOPER_MODE:
+            # FIXME: We need CCAN for some API tests, but waf thinks it's
+            # already available via ntdb.  It is, but not publicly.
+            # Workaround is to build a private, non-hiding version.
+            bld.SAMBA_SUBSYSTEM('ntdb-testing',
+                                SRC,
+                                deps='replace ccan',
+                                includes='.')
+
+            bld.SAMBA_SUBSYSTEM('ntdb-test-helpers',
+                                bld.env.NTDB_TEST_HELPER_SRC,
+                                deps='replace')
+            bld.SAMBA_SUBSYSTEM('ntdb-run-helpers',
+                                bld.env.NTDB_TEST_RUN_HELPER_SRC,
+                                deps='replace')
+            bld.SAMBA_SUBSYSTEM('ntdb-api-helpers',
+                                bld.env.NTDB_TEST_API_HELPER_SRC,
+                                deps='replace ntdb-testing')
+
+            for f in bld.env.NTDB_TEST_RUN_SRC:
+                base = os.path.splitext(os.path.basename(f))[0]  # 'test/run-traverse.c' -> 'run-traverse'
+                bld.SAMBA_BINARY('ntdb-' + base, f,
+                                 deps='ccan replace ntdb-test-helpers ntdb-run-helpers ccan-failtest',
+                                 install=False)
+
+            for f in bld.env.NTDB_TEST_API_SRC:
+                base = os.path.splitext(os.path.basename(f))[0]  # 'test/api-summary.c' -> 'api-summary'
+                bld.SAMBA_BINARY('ntdb-' + base, f,
+                                 deps='ccan replace ntdb-test-helpers ntdb-api-helpers',
+                                 install=False)
+
+        if not bld.CONFIG_SET('USING_SYSTEM_PYNTDB'):  # NOTE(review): nested under the USING_SYSTEM_NTDB check, so pyntdb is skipped whenever the system ntdb is used - confirm intended
+            bld.SAMBA_PYTHON('pyntdb',
+                             source='pyntdb.c',
+                             deps='ntdb',
+                             enabled=not bld.env.disable_python,
+                             realname='ntdb.so',
+                             cflags='-DPACKAGE_VERSION=\"%s\"' % VERSION)
+
+def testonly(ctx):
+    '''run ntdb testsuite'''
+    import Utils, samba_utils, shutil
+    ecode = 0  # exit status: stays 0 unless a test program fails
+
+    env = samba_utils.LOAD_ENVIRONMENT()
+
+    if env.standalone_ntdb:
+        # FIXME: This is horrible :(
+        test_prefix = "%s/st" % (Utils.g_module.blddir)
+        shutil.rmtree(test_prefix, ignore_errors=True)
+        os.makedirs(test_prefix)
+
+        # Create scratch directory for tests.
+        testdir = os.path.join(test_prefix, 'ntdb-tests')
+        samba_utils.mkdir_p(testdir)
+        # Symlink back to source dir so it can find tests in test/
+        link = os.path.join(testdir, 'test')
+        if not os.path.exists(link):
+            os.symlink(os.path.abspath(os.path.join(env.cwd, 'test')), link)
+
+        if Options.options.VALGRIND:
+            os.environ['VALGRIND'] = 'valgrind -q --num-callers=30'
+        if Options.options.VALGRINDLOG is not None and 'VALGRIND' in os.environ:  # no KeyError when only --valgrind-log is given
+            os.environ['VALGRIND'] += ' --log-file=%s' % Options.options.VALGRINDLOG
+
+        for f in env.NTDB_TEST_RUN_SRC + env.NTDB_TEST_API_SRC:
+            name = "ntdb-" + os.path.splitext(os.path.basename(f))[0]
+            cmd = "cd " + testdir + " && $VALGRIND " + os.path.abspath(os.path.join(Utils.g_module.blddir, name)) + " > test-output 2>&1"
+            print("..." + f)
+            ret = samba_utils.RUN_COMMAND(cmd)
+            if ret != 0:
+                print("%s (%s) failed:" % (name, f))
+                samba_utils.RUN_COMMAND("cat " + os.path.join(testdir, 'test-output'))  # show the captured output
+                ecode = ret
+                break  # stop at the first failing test
+
+    sys.exit(ecode)
+
+# WAF doesn't build the unit tests for this, maybe because they don't link with ntdb?
+# This forces it by queueing an explicit 'build' ahead of 'testonly'.
+def test(ctx):
+    from Scripting import commands as pending
+    # same queue order as two separate appends: build first, then testonly
+    pending.extend(['build', 'testonly'])
+
+def dist():
+    '''makes a tarball for distribution'''
+    samba_dist.dist()  # all packaging work is delegated to the samba_dist helper
+
+def reconfigure(ctx):
+    '''reconfigure if config scripts have changed'''
+    from samba_utils import reconfigure as samba_reconfigure
+    samba_reconfigure(ctx)  # hand the waf context straight to the shared helper
diff --git a/lib/tdb2/ABI/tdb-2.0.0.sigs b/lib/tdb2/ABI/tdb-2.0.0.sigs
deleted file mode 100644
index 0e54b90895..0000000000
--- a/lib/tdb2/ABI/tdb-2.0.0.sigs
+++ /dev/null
@@ -1,40 +0,0 @@
-tdb1_incompatible_hash: uint64_t (const void *, size_t, uint64_t, void *)
-tdb_add_flag: void (struct tdb_context *, unsigned int)
-tdb_append: enum TDB_ERROR (struct tdb_context *, struct tdb_data, struct tdb_data)
-tdb_chainlock: enum TDB_ERROR (struct tdb_context *, TDB_DATA)
-tdb_chainlock_read: enum TDB_ERROR (struct tdb_context *, TDB_DATA)
-tdb_chainunlock: void (struct tdb_context *, TDB_DATA)
-tdb_chainunlock_read: void (struct tdb_context *, TDB_DATA)
-tdb_check_: enum TDB_ERROR (struct tdb_context *, enum TDB_ERROR (*)(TDB_DATA, TDB_DATA, void *), void *)
-tdb_close: int (struct tdb_context *)
-tdb_delete: enum TDB_ERROR (struct tdb_context *, struct tdb_data)
-tdb_error: enum TDB_ERROR (struct tdb_context *)
-tdb_errorstr: const char *(enum TDB_ERROR)
-tdb_exists: bool (struct tdb_context *, TDB_DATA)
-tdb_fd: int (const struct tdb_context *)
-tdb_fetch: enum TDB_ERROR (struct tdb_context *, struct tdb_data, struct tdb_data *)
-tdb_firstkey: enum TDB_ERROR (struct tdb_context *, struct tdb_data *)
-tdb_foreach_: void (int (*)(struct tdb_context *, void *), void *)
-tdb_get_attribute: enum TDB_ERROR (struct tdb_context *, union tdb_attribute *)
-tdb_get_flags: unsigned int (struct tdb_context *)
-tdb_get_seqnum: int64_t (struct tdb_context *)
-tdb_lockall: enum TDB_ERROR (struct tdb_context *)
-tdb_lockall_read: enum TDB_ERROR (struct tdb_context *)
-tdb_name: const char *(const struct tdb_context *)
-tdb_nextkey: enum TDB_ERROR (struct tdb_context *, struct tdb_data *)
-tdb_open: struct tdb_context *(const char *, int, int, mode_t, union tdb_attribute *)
-tdb_parse_record_: enum TDB_ERROR (struct tdb_context *, TDB_DATA, enum TDB_ERROR (*)(TDB_DATA, TDB_DATA, void *), void *)
-tdb_remove_flag: void (struct tdb_context *, unsigned int)
-tdb_repack: enum TDB_ERROR (struct tdb_context *)
-tdb_set_attribute: enum TDB_ERROR (struct tdb_context *, const union tdb_attribute *)
-tdb_store: enum TDB_ERROR (struct tdb_context *, struct tdb_data, struct tdb_data, int)
-tdb_summary: enum TDB_ERROR (struct tdb_context *, enum tdb_summary_flags, char **)
-tdb_transaction_cancel: void (struct tdb_context *)
-tdb_transaction_commit: enum TDB_ERROR (struct tdb_context *)
-tdb_transaction_prepare_commit: enum TDB_ERROR (struct tdb_context *)
-tdb_transaction_start: enum TDB_ERROR (struct tdb_context *)
-tdb_traverse_: int64_t (struct tdb_context *, int (*)(struct tdb_context *, TDB_DATA, TDB_DATA, void *), void *)
-tdb_unlockall: void (struct tdb_context *)
-tdb_unlockall_read: void (struct tdb_context *)
-tdb_unset_attribute: void (struct tdb_context *, enum tdb_attribute_type)
-tdb_wipe_all: enum TDB_ERROR (struct tdb_context *)
diff --git a/lib/tdb2/ABI/tdb-2.0.1.sigs b/lib/tdb2/ABI/tdb-2.0.1.sigs
deleted file mode 100644
index f9ee55f84a..0000000000
--- a/lib/tdb2/ABI/tdb-2.0.1.sigs
+++ /dev/null
@@ -1,39 +0,0 @@
-tdb_add_flag: void (struct tdb_context *, unsigned int)
-tdb_append: enum TDB_ERROR (struct tdb_context *, struct tdb_data, struct tdb_data)
-tdb_chainlock: enum TDB_ERROR (struct tdb_context *, TDB_DATA)
-tdb_chainlock_read: enum TDB_ERROR (struct tdb_context *, TDB_DATA)
-tdb_chainunlock: void (struct tdb_context *, TDB_DATA)
-tdb_chainunlock_read: void (struct tdb_context *, TDB_DATA)
-tdb_check_: enum TDB_ERROR (struct tdb_context *, enum TDB_ERROR (*)(TDB_DATA, TDB_DATA, void *), void *)
-tdb_close: int (struct tdb_context *)
-tdb_delete: enum TDB_ERROR (struct tdb_context *, struct tdb_data)
-tdb_error: enum TDB_ERROR (struct tdb_context *)
-tdb_errorstr: const char *(enum TDB_ERROR)
-tdb_exists: bool (struct tdb_context *, TDB_DATA)
-tdb_fd: int (const struct tdb_context *)
-tdb_fetch: enum TDB_ERROR (struct tdb_context *, struct tdb_data, struct tdb_data *)
-tdb_firstkey: enum TDB_ERROR (struct tdb_context *, struct tdb_data *)
-tdb_foreach_: void (int (*)(struct tdb_context *, void *), void *)
-tdb_get_attribute: enum TDB_ERROR (struct tdb_context *, union tdb_attribute *)
-tdb_get_flags: unsigned int (struct tdb_context *)
-tdb_get_seqnum: int64_t (struct tdb_context *)
-tdb_lockall: enum TDB_ERROR (struct tdb_context *)
-tdb_lockall_read: enum TDB_ERROR (struct tdb_context *)
-tdb_name: const char *(const struct tdb_context *)
-tdb_nextkey: enum TDB_ERROR (struct tdb_context *, struct tdb_data *)
-tdb_open: struct tdb_context *(const char *, int, int, mode_t, union tdb_attribute *)
-tdb_parse_record_: enum TDB_ERROR (struct tdb_context *, TDB_DATA, enum TDB_ERROR (*)(TDB_DATA, TDB_DATA, void *), void *)
-tdb_remove_flag: void (struct tdb_context *, unsigned int)
-tdb_repack: enum TDB_ERROR (struct tdb_context *)
-tdb_set_attribute: enum TDB_ERROR (struct tdb_context *, const union tdb_attribute *)
-tdb_store: enum TDB_ERROR (struct tdb_context *, struct tdb_data, struct tdb_data, int)
-tdb_summary: enum TDB_ERROR (struct tdb_context *, enum tdb_summary_flags, char **)
-tdb_transaction_cancel: void (struct tdb_context *)
-tdb_transaction_commit: enum TDB_ERROR (struct tdb_context *)
-tdb_transaction_prepare_commit: enum TDB_ERROR (struct tdb_context *)
-tdb_transaction_start: enum TDB_ERROR (struct tdb_context *)
-tdb_traverse_: int64_t (struct tdb_context *, int (*)(struct tdb_context *, TDB_DATA, TDB_DATA, void *), void *)
-tdb_unlockall: void (struct tdb_context *)
-tdb_unlockall_read: void (struct tdb_context *)
-tdb_unset_attribute: void (struct tdb_context *, enum tdb_attribute_type)
-tdb_wipe_all: enum TDB_ERROR (struct tdb_context *)
diff --git a/lib/tdb2/LICENSE b/lib/tdb2/LICENSE
deleted file mode 100644
index cca7fc278f..0000000000
--- a/lib/tdb2/LICENSE
+++ /dev/null
@@ -1,165 +0,0 @@
-		   GNU LESSER GENERAL PUBLIC LICENSE
-                       Version 3, 29 June 2007
-
- Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
-
-
-  This version of the GNU Lesser General Public License incorporates
-the terms and conditions of version 3 of the GNU General Public
-License, supplemented by the additional permissions listed below.
-
-  0. Additional Definitions.
-
-  As used herein, "this License" refers to version 3 of the GNU Lesser
-General Public License, and the "GNU GPL" refers to version 3 of the GNU
-General Public License.
-
-  "The Library" refers to a covered work governed by this License,
-other than an Application or a Combined Work as defined below.
-
-  An "Application" is any work that makes use of an interface provided
-by the Library, but which is not otherwise based on the Library.
-Defining a subclass of a class defined by the Library is deemed a mode
-of using an interface provided by the Library.
-
-  A "Combined Work" is a work produced by combining or linking an
-Application with the Library.  The particular version of the Library
-with which the Combined Work was made is also called the "Linked
-Version".
-
-  The "Minimal Corresponding Source" for a Combined Work means the
-Corresponding Source for the Combined Work, excluding any source code
-for portions of the Combined Work that, considered in isolation, are
-based on the Application, and not on the Linked Version.
-
-  The "Corresponding Application Code" for a Combined Work means the
-object code and/or source code for the Application, including any data
-and utility programs needed for reproducing the Combined Work from the
-Application, but excluding the System Libraries of the Combined Work.
-
-  1. Exception to Section 3 of the GNU GPL.
-
-  You may convey a covered work under sections 3 and 4 of this License
-without being bound by section 3 of the GNU GPL.
-
-  2. Conveying Modified Versions.
-
-  If you modify a copy of the Library, and, in your modifications, a
-facility refers to a function or data to be supplied by an Application
-that uses the facility (other than as an argument passed when the
-facility is invoked), then you may convey a copy of the modified
-version:
-
-   a) under this License, provided that you make a good faith effort to
-   ensure that, in the event an Application does not supply the
-   function or data, the facility still operates, and performs
-   whatever part of its purpose remains meaningful, or
-
-   b) under the GNU GPL, with none of the additional permissions of
-   this License applicable to that copy.
-
-  3. Object Code Incorporating Material from Library Header Files.
-
-  The object code form of an Application may incorporate material from
-a header file that is part of the Library.  You may convey such object
-code under terms of your choice, provided that, if the incorporated
-material is not limited to numerical parameters, data structure
-layouts and accessors, or small macros, inline functions and templates
-(ten or fewer lines in length), you do both of the following:
-
-   a) Give prominent notice with each copy of the object code that the
-   Library is used in it and that the Library and its use are
-   covered by this License.
-
-   b) Accompany the object code with a copy of the GNU GPL and this license
-   document.
-
-  4. Combined Works.
-
-  You may convey a Combined Work under terms of your choice that,
-taken together, effectively do not restrict modification of the
-portions of the Library contained in the Combined Work and reverse
-engineering for debugging such modifications, if you also do each of
-the following:
-
-   a) Give prominent notice with each copy of the Combined Work that
-   the Library is used in it and that the Library and its use are
-   covered by this License.
-
-   b) Accompany the Combined Work with a copy of the GNU GPL and this license
-   document.
-
-   c) For a Combined Work that displays copyright notices during
-   execution, include the copyright notice for the Library among
-   these notices, as well as a reference directing the user to the
-   copies of the GNU GPL and this license document.
-
-   d) Do one of the following:
-
-       0) Convey the Minimal Corresponding Source under the terms of this
-       License, and the Corresponding Application Code in a form
-       suitable for, and under terms that permit, the user to
-       recombine or relink the Application with a modified version of
-       the Linked Version to produce a modified Combined Work, in the
-       manner specified by section 6 of the GNU GPL for conveying
-       Corresponding Source.
-
-       1) Use a suitable shared library mechanism for linking with the
-       Library.  A suitable mechanism is one that (a) uses at run time
-       a copy of the Library already present on the user's computer
-       system, and (b) will operate properly with a modified version
-       of the Library that is interface-compatible with the Linked
-       Version.
-
-   e) Provide Installation Information, but only if you would otherwise
-   be required to provide such information under section 6 of the
-   GNU GPL, and only to the extent that such information is
-   necessary to install and execute a modified version of the
-   Combined Work produced by recombining or relinking the
-   Application with a modified version of the Linked Version. (If
-   you use option 4d0, the Installation Information must accompany
-   the Minimal Corresponding Source and Corresponding Application
-   Code. If you use option 4d1, you must provide the Installation
-   Information in the manner specified by section 6 of the GNU GPL
-   for conveying Corresponding Source.)
-
-  5. Combined Libraries.
-
-  You may place library facilities that are a work based on the
-Library side by side in a single library together with other library
-facilities that are not Applications and are not covered by this
-License, and convey such a combined library under terms of your
-choice, if you do both of the following:
-
-   a) Accompany the combined library with a copy of the same work based
-   on the Library, uncombined with any other library facilities,
-   conveyed under the terms of this License.
-
-   b) Give prominent notice with the combined library that part of it
-   is a work based on the Library, and explaining where to find the
-   accompanying uncombined form of the same work.
-
-  6. Revised Versions of the GNU Lesser General Public License.
-
-  The Free Software Foundation may publish revised and/or new versions
-of the GNU Lesser General Public License from time to time. Such new
-versions will be similar in spirit to the present version, but may
-differ in detail to address new problems or concerns.
-
-  Each version is given a distinguishing version number. If the
-Library as you received it specifies that a certain numbered version
-of the GNU Lesser General Public License "or any later version"
-applies to it, you have the option of following the terms and
-conditions either of that published version or of any later version
-published by the Free Software Foundation. If the Library as you
-received it does not specify a version number of the GNU Lesser
-General Public License, you may choose any version of the GNU Lesser
-General Public License ever published by the Free Software Foundation.
-
-  If the Library as you received it specifies that a proxy can decide
-whether future versions of the GNU Lesser General Public License shall
-apply, that proxy's public statement of acceptance of any version is
-permanent authorization for you to choose that version for the
-Library.
diff --git a/lib/tdb2/Makefile b/lib/tdb2/Makefile
deleted file mode 100644
index ddd439d503..0000000000
--- a/lib/tdb2/Makefile
+++ /dev/null
@@ -1,67 +0,0 @@
-# simple makefile wrapper to run waf
-
-WAF=WAF_MAKE=1 PATH=buildtools/bin:../../buildtools/bin:$$PATH waf
-
-all:
-	$(WAF) build
-
-install:
-	$(WAF) install
-
-uninstall:
-	$(WAF) uninstall
-
-test: FORCE
-	$(WAF) test $(TEST_OPTIONS)
-
-testenv:
-	$(WAF) test --testenv $(TEST_OPTIONS)
-
-quicktest:
-	$(WAF) test --quick $(TEST_OPTIONS)
-
-dist:
-	touch .tmplock
-	WAFLOCK=.tmplock $(WAF) dist
-
-distcheck:
-	touch .tmplock
-	WAFLOCK=.tmplock $(WAF) distcheck
-
-clean:
-	$(WAF) clean
-
-distclean:
-	$(WAF) distclean
-
-reconfigure: configure
-	$(WAF) reconfigure
-
-show_waf_options:
-	$(WAF) --help
-
-# some compatibility make targets
-everything: all
-
-testsuite: all
-
-.PHONY: check
-check: test
-
-torture: all
-
-# this should do an install as well, once install is finished
-installcheck: test
-
-etags:
-	$(WAF) etags
-
-ctags:
-	$(WAF) ctags
-
-pydoctor:
-	$(WAF) pydoctor
-
-bin/%:: FORCE
-	$(WAF) --targets=`basename $@`
-FORCE:
diff --git a/lib/tdb2/TODO b/lib/tdb2/TODO
deleted file mode 100644
index 0a9374f016..0000000000
--- a/lib/tdb2/TODO
+++ /dev/null
@@ -1,4 +0,0 @@
-- tdb2restore, tdb2dump, tdb2backup
-- tdb2tool man page
-- Integrate ccan testsuite
-- Integrate tdb2 testsuite
diff --git a/lib/tdb2/_info b/lib/tdb2/_info
deleted file mode 100644
index 37c0c29e99..0000000000
--- a/lib/tdb2/_info
+++ /dev/null
@@ -1,91 +0,0 @@
-#include <string.h>
-#include <stdio.h>
-
-/**
- * tdb2 - [[WORK IN PROGRESS!]] The trivial (64bit transactional) database
- *
- * The tdb2 module provides an efficient keyword data mapping (usually
- * within a file).  It supports transactions, so the contents of the
- * database is reliable even across crashes.
- *
- * Example:
- *	#include <ccan/tdb2/tdb2.h>
- *	#include <ccan/str/str.h>
- *	#include <ccan/err/err.h>
- *	#include <stdio.h>
- *
- *	static void usage(const char *argv0)
- *	{
- *		errx(1, "Usage: %s fetch <dbfile> <key>\n"
- *		     "OR %s store <dbfile> <key> <data>", argv0, argv0);
- *	}
- *
- *	int main(int argc, char *argv[])
- *	{
- *		struct tdb_context *tdb;
- *		TDB_DATA key, value;
- *		enum TDB_ERROR error;
- *
- *		if (argc < 4)
- *			usage(argv[0]);
- *
- *		tdb = tdb_open(argv[2], TDB_DEFAULT, O_CREAT|O_RDWR,0600, NULL);
- *		if (!tdb)
- *			err(1, "Opening %s", argv[2]);
- *
- *		key.dptr = (void *)argv[3];
- *		key.dsize = strlen(argv[3]);
- *
- *		if (streq(argv[1], "fetch")) {
- *			if (argc != 4)
- *				usage(argv[0]);
- *			error = tdb_fetch(tdb, key, &value);
- *			if (error)
- *				errx(1, "fetch %s: %s",
- *				     argv[3], tdb_errorstr(error));
- *			printf("%.*s\n", value.dsize, (char *)value.dptr);
- *			free(value.dptr);
- *		} else if (streq(argv[1], "store")) {
- *			if (argc != 5)
- *				usage(argv[0]);
- *			value.dptr = (void *)argv[4];
- *			value.dsize = strlen(argv[4]);
- *			error = tdb_store(tdb, key, value, 0);
- *			if (error)
- *				errx(1, "store %s: %s",
- *				     argv[3], tdb_errorstr(error));
- *		} else
- *			usage(argv[0]);
- *
- *		return 0;
- *	}
- *
- * Maintainer: Rusty Russell <rusty@rustcorp.com.au>
- *
- * Author: Rusty Russell
- *
- * License: LGPLv3 (or later)
- */
-int main(int argc, char *argv[])
-{
-	if (argc != 2)
-		return 1;
-
-	if (strcmp(argv[1], "depends") == 0) {
-		printf("ccan/asprintf\n");
-		printf("ccan/hash\n");
-		printf("ccan/likely\n");
-		printf("ccan/asearch\n");
-		printf("ccan/compiler\n");
-		printf("ccan/build_assert\n");
-		printf("ccan/ilog\n");
-		printf("ccan/failtest\n");
-		printf("ccan/tally\n");
-		printf("ccan/typesafe_cb\n");
-		printf("ccan/cast\n");
-		printf("ccan/endian\n");
-		return 0;
-	}
-
-	return 1;
-}
diff --git a/lib/tdb2/check.c b/lib/tdb2/check.c
deleted file mode 100644
index 4b589b6ee1..0000000000
--- a/lib/tdb2/check.c
+++ /dev/null
@@ -1,864 +0,0 @@
- /*
-   Trivial Database 2: free list/block handling
-   Copyright (C) Rusty Russell 2010
-
-   This library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 3 of the License, or (at your option) any later version.
-
-   This library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
-#include "private.h"
-#include <ccan/likely/likely.h>
-#include <ccan/asearch/asearch.h>
-
-/* We keep an ordered array of offsets. */
-static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off)
-{
-	tdb_off_t *new = realloc(*arr, (*num + 1) * sizeof(tdb_off_t));
-	if (!new)
-		return false;
-	new[(*num)++] = off;
-	*arr = new;
-	return true;
-}
-
-static enum TDB_ERROR check_header(struct tdb_context *tdb, tdb_off_t *recovery,
-				   uint64_t *features, size_t *num_capabilities)
-{
-	uint64_t hash_test;
-	struct tdb_header hdr;
-	enum TDB_ERROR ecode;
-	tdb_off_t off, next;
-
-	ecode = tdb_read_convert(tdb, 0, &hdr, sizeof(hdr));
-	if (ecode != TDB_SUCCESS) {
-		return ecode;
-	}
-	/* magic food should not be converted, so convert back. */
-	tdb_convert(tdb, hdr.magic_food, sizeof(hdr.magic_food));
-
-	hash_test = TDB_HASH_MAGIC;
-	hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
-	if (hdr.hash_test != hash_test) {
-		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-				  "check: hash test %llu should be %llu",
-				  (long long)hdr.hash_test,
-				  (long long)hash_test);
-	}
-
-	if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
-		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-				  "check: bad magic '%.*s'",
-				  (unsigned)sizeof(hdr.magic_food),
-				  hdr.magic_food);
-	}
-
-	/* Features which are used must be a subset of features offered. */
-	if (hdr.features_used & ~hdr.features_offered) {
-		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-				  "check: features used (0x%llx) which"
-				  " are not offered (0x%llx)",
-				  (long long)hdr.features_used,
-				  (long long)hdr.features_offered);
-	}
-
-	*features = hdr.features_offered;
-	*recovery = hdr.recovery;
-	if (*recovery) {
-		if (*recovery < sizeof(hdr)
-		    || *recovery > tdb->file->map_size) {
-			return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-					  "tdb_check:"
-					  " invalid recovery offset %zu",
-					  (size_t)*recovery);
-		}
-	}
-
-	for (off = hdr.capabilities; off && ecode == TDB_SUCCESS; off = next) {
-		const struct tdb_capability *cap;
-		enum TDB_ERROR e;
-
-		cap = tdb_access_read(tdb, off, sizeof(*cap), true);
-		if (TDB_PTR_IS_ERR(cap)) {
-			return TDB_PTR_ERR(cap);
-		}
-
-		/* All capabilities are unknown. */
-		e = unknown_capability(tdb, "tdb_check", cap->type);
-		next = cap->next;
-		tdb_access_release(tdb, cap);
-		if (e)
-			return e;
-		(*num_capabilities)++;
-	}
-
-	/* Don't check reserved: they *can* be used later. */
-	return TDB_SUCCESS;
-}
-
-static enum TDB_ERROR check_hash_tree(struct tdb_context *tdb,
-				      tdb_off_t off, unsigned int group_bits,
-				      uint64_t hprefix,
-				      unsigned hprefix_bits,
-				      tdb_off_t used[],
-				      size_t num_used,
-				      size_t *num_found,
-				      enum TDB_ERROR (*check)(TDB_DATA,
-							      TDB_DATA, void *),
-				      void *data);
-
-static enum TDB_ERROR check_hash_chain(struct tdb_context *tdb,
-				       tdb_off_t off,
-				       uint64_t hash,
-				       tdb_off_t used[],
-				       size_t num_used,
-				       size_t *num_found,
-				       enum TDB_ERROR (*check)(TDB_DATA,
-							       TDB_DATA,
-							       void *),
-				       void *data)
-{
-	struct tdb_used_record rec;
-	enum TDB_ERROR ecode;
-
-	ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
-	if (ecode != TDB_SUCCESS) {
-		return ecode;
-	}
-
-	if (rec_magic(&rec) != TDB_CHAIN_MAGIC) {
-		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-				  "tdb_check: Bad hash chain magic %llu",
-				  (long long)rec_magic(&rec));
-	}
-
-	if (rec_data_length(&rec) != sizeof(struct tdb_chain)) {
-		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-				  "tdb_check:"
-				  " Bad hash chain length %llu vs %zu",
-				  (long long)rec_data_length(&rec),
-				  sizeof(struct tdb_chain));
-	}
-	if (rec_key_length(&rec) != 0) {
-		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-				  "tdb_check: Bad hash chain key length %llu",
-				  (long long)rec_key_length(&rec));
-	}
-	if (rec_hash(&rec) != 0) {
-		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-				  "tdb_check: Bad hash chain hash value %llu",
-				  (long long)rec_hash(&rec));
-	}
-
-	off += sizeof(rec);
-	ecode = check_hash_tree(tdb, off, 0, hash, 64,
-				used, num_used, num_found, check, data);
-	if (ecode != TDB_SUCCESS) {
-		return ecode;
-	}
-
-	off = tdb_read_off(tdb, off + offsetof(struct tdb_chain, next));
-	if (TDB_OFF_IS_ERR(off)) {
-		return TDB_OFF_TO_ERR(off);
-	}
-	if (off == 0)
-		return TDB_SUCCESS;
-	(*num_found)++;
-	return check_hash_chain(tdb, off, hash, used, num_used, num_found,
-				check, data);
-}
-
-static enum TDB_ERROR check_hash_record(struct tdb_context *tdb,
-					tdb_off_t off,
-					uint64_t hprefix,
-					unsigned hprefix_bits,
-					tdb_off_t used[],
-					size_t num_used,
-					size_t *num_found,
-					enum TDB_ERROR (*check)(TDB_DATA,
-								TDB_DATA,
-								void *),
-					void *data)
-{
-	struct tdb_used_record rec;
-	enum TDB_ERROR ecode;
-
-	if (hprefix_bits >= 64)
-		return check_hash_chain(tdb, off, hprefix, used, num_used,
-					num_found, check, data);
-
-	ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
-	if (ecode != TDB_SUCCESS) {
-		return ecode;
-	}
-
-	if (rec_magic(&rec) != TDB_HTABLE_MAGIC) {
-		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-				  "tdb_check: Bad hash table magic %llu",
-				  (long long)rec_magic(&rec));
-	}
-	if (rec_data_length(&rec)
-	    != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) {
-		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-				  "tdb_check:"
-				  " Bad hash table length %llu vs %llu",
-				  (long long)rec_data_length(&rec),
-				  (long long)sizeof(tdb_off_t)
-				  << TDB_SUBLEVEL_HASH_BITS);
-	}
-	if (rec_key_length(&rec) != 0) {
-		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-				  "tdb_check: Bad hash table key length %llu",
-				  (long long)rec_key_length(&rec));
-	}
-	if (rec_hash(&rec) != 0) {
-		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-				  "tdb_check: Bad hash table hash value %llu",
-				  (long long)rec_hash(&rec));
-	}
-
-	off += sizeof(rec);
-	return check_hash_tree(tdb, off,
-			       TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
-			       hprefix, hprefix_bits,
-			       used, num_used, num_found, check, data);
-}
-
-static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
-{
-	/* Can overflow an int. */
-	return *a > *b ? 1
-		: *a < *b ? -1
-		: 0;
-}
-
-static uint64_t get_bits(uint64_t h, unsigned num, unsigned *used)
-{
-	*used += num;
-
-	return (h >> (64 - *used)) & ((1U << num) - 1);
-}
-
-static enum TDB_ERROR check_hash_tree(struct tdb_context *tdb,
-				      tdb_off_t off, unsigned int group_bits,
-				      uint64_t hprefix,
-				      unsigned hprefix_bits,
-				      tdb_off_t used[],
-				      size_t num_used,
-				      size_t *num_found,
-				      enum TDB_ERROR (*check)(TDB_DATA,
-							      TDB_DATA, void *),
-				      void *data)
-{
-	unsigned int g, b;
-	const tdb_off_t *hash;
-	struct tdb_used_record rec;
-	enum TDB_ERROR ecode;
-
-	hash = tdb_access_read(tdb, off,
-			       sizeof(tdb_off_t)
-			       << (group_bits + TDB_HASH_GROUP_BITS),
-			       true);
-	if (TDB_PTR_IS_ERR(hash)) {
-		return TDB_PTR_ERR(hash);
-	}
-
-	for (g = 0; g < (1 << group_bits); g++) {
-		const tdb_off_t *group = hash + (g << TDB_HASH_GROUP_BITS);
-		for (b = 0; b < (1 << TDB_HASH_GROUP_BITS); b++) {
-			unsigned int bucket, i, used_bits;
-			uint64_t h;
-			tdb_off_t *p;
-			if (group[b] == 0)
-				continue;
-
-			off = group[b] & TDB_OFF_MASK;
-			p = asearch(&off, used, num_used, off_cmp);
-			if (!p) {
-				ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
-						   TDB_LOG_ERROR,
-						   "tdb_check: Invalid offset"
-						   " %llu in hash",
-						   (long long)off);
-				goto fail;
-			}
-			/* Mark it invalid. */
-			*p ^= 1;
-			(*num_found)++;
-
-			if (hprefix_bits == 64) {
-				/* Chained entries are unordered. */
-				if (is_subhash(group[b])) {
-					ecode = TDB_ERR_CORRUPT;
-					tdb_logerr(tdb, ecode,
-						   TDB_LOG_ERROR,
-						   "tdb_check: Invalid chain"
-						   " entry subhash");
-					goto fail;
-				}
-				h = hash_record(tdb, off);
-				if (h != hprefix) {
-					ecode = TDB_ERR_CORRUPT;
-					tdb_logerr(tdb, ecode,
-						   TDB_LOG_ERROR,
-						   "check: bad hash chain"
-						   " placement"
-						   " 0x%llx vs 0x%llx",
-						   (long long)h,
-						   (long long)hprefix);
-					goto fail;
-				}
-				ecode = tdb_read_convert(tdb, off, &rec,
-							 sizeof(rec));
-				if (ecode != TDB_SUCCESS) {
-					goto fail;
-				}
-				goto check;
-			}
-
-			if (is_subhash(group[b])) {
-				uint64_t subprefix;
-				subprefix = (hprefix
-				     << (group_bits + TDB_HASH_GROUP_BITS))
-					+ g * (1 << TDB_HASH_GROUP_BITS) + b;
-
-				ecode = check_hash_record(tdb,
-					       group[b] & TDB_OFF_MASK,
-					       subprefix,
-					       hprefix_bits
-						       + group_bits
-						       + TDB_HASH_GROUP_BITS,
-					       used, num_used, num_found,
-					       check, data);
-				if (ecode != TDB_SUCCESS) {
-					goto fail;
-				}
-				continue;
-			}
-			/* A normal entry */
-
-			/* Does it belong here at all? */
-			h = hash_record(tdb, off);
-			used_bits = 0;
-			if (get_bits(h, hprefix_bits, &used_bits) != hprefix
-			    && hprefix_bits) {
-				ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
-						   TDB_LOG_ERROR,
-						   "check: bad hash placement"
-						   " 0x%llx vs 0x%llx",
-						   (long long)h,
-						   (long long)hprefix);
-				goto fail;
-			}
-
-			/* Does it belong in this group? */
-			if (get_bits(h, group_bits, &used_bits) != g) {
-				ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
-						   TDB_LOG_ERROR,
-						   "check: bad group %llu"
-						   " vs %u",
-						   (long long)h, g);
-				goto fail;
-			}
-
-			/* Are bucket bits correct? */
-			bucket = group[b] & TDB_OFF_HASH_GROUP_MASK;
-			if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
-			    != bucket) {
-				used_bits -= TDB_HASH_GROUP_BITS;
-				ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
-						   TDB_LOG_ERROR,
-						   "check: bad bucket %u vs %u",
-						   (unsigned)get_bits(h,
-							TDB_HASH_GROUP_BITS,
-							&used_bits),
-						   bucket);
-				goto fail;
-			}
-
-			/* There must not be any zero entries between
-			 * the bucket it belongs in and this one! */
-			for (i = bucket;
-			     i != b;
-			     i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
-				if (group[i] == 0) {
-					ecode = TDB_ERR_CORRUPT;
-					tdb_logerr(tdb, ecode,
-						   TDB_LOG_ERROR,
-						   "check: bad group placement"
-						   " %u vs %u",
-						   b, bucket);
-					goto fail;
-				}
-			}
-
-			ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
-			if (ecode != TDB_SUCCESS) {
-				goto fail;
-			}
-
-			/* Bottom bits must match header. */
-			if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
-				ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
-						   TDB_LOG_ERROR,
-						   "tdb_check: Bad hash magic"
-						   " at offset %llu"
-						   " (0x%llx vs 0x%llx)",
-						   (long long)off,
-						   (long long)h,
-						   (long long)rec_hash(&rec));
-				goto fail;
-			}
-
-		check:
-			if (check) {
-				TDB_DATA k, d;
-				const unsigned char *kptr;
-
-				kptr = tdb_access_read(tdb,
-						       off + sizeof(rec),
-						       rec_key_length(&rec)
-						       + rec_data_length(&rec),
-						       false);
-				if (TDB_PTR_IS_ERR(kptr)) {
-					ecode = TDB_PTR_ERR(kptr);
-					goto fail;
-				}
-
-				k = tdb_mkdata(kptr, rec_key_length(&rec));
-				d = tdb_mkdata(kptr + k.dsize,
-					       rec_data_length(&rec));
-				ecode = check(k, d, data);
-				tdb_access_release(tdb, kptr);
-				if (ecode != TDB_SUCCESS) {
-					goto fail;
-				}
-			}
-		}
-	}
-	tdb_access_release(tdb, hash);
-	return TDB_SUCCESS;
-
-fail:
-	tdb_access_release(tdb, hash);
-	return ecode;
-}
-
-static enum TDB_ERROR check_hash(struct tdb_context *tdb,
-				 tdb_off_t used[],
-				 size_t num_used, size_t num_other_used,
-				 enum TDB_ERROR (*check)(TDB_DATA, TDB_DATA, void *),
-				 void *data)
-{
-	/* Free tables and capabilities also show up as used. */
-	size_t num_found = num_other_used;
-	enum TDB_ERROR ecode;
-
-	ecode = check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
-				TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
-				0, 0, used, num_used, &num_found,
-				check, data);
-	if (ecode == TDB_SUCCESS) {
-		if (num_found != num_used) {
-			ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-					   "tdb_check: Not all entries"
-					   " are in hash");
-		}
-	}
-	return ecode;
-}
-
-static enum TDB_ERROR check_free(struct tdb_context *tdb,
-				 tdb_off_t off,
-				 const struct tdb_free_record *frec,
-				 tdb_off_t prev, unsigned int ftable,
-				 unsigned int bucket)
-{
-	enum TDB_ERROR ecode;
-
-	if (frec_magic(frec) != TDB_FREE_MAGIC) {
-		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-				  "tdb_check: offset %llu bad magic 0x%llx",
-				  (long long)off,
-				  (long long)frec->magic_and_prev);
-	}
-	if (frec_ftable(frec) != ftable) {
-		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-				  "tdb_check: offset %llu bad freetable %u",
-				  (long long)off, frec_ftable(frec));
-
-	}
-
-	ecode = tdb->io->oob(tdb, off,
-			     frec_len(frec)
-			     + sizeof(struct tdb_used_record),
-			     false);
-	if (ecode != TDB_SUCCESS) {
-		return ecode;
-	}
-	if (size_to_bucket(frec_len(frec)) != bucket) {
-		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-				  "tdb_check: offset %llu in wrong bucket"
-				  " (%u vs %u)",
-				  (long long)off,
-				  bucket, size_to_bucket(frec_len(frec)));
-	}
-	if (prev && prev != frec_prev(frec)) {
-		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-				  "tdb_check: offset %llu bad prev"
-				  " (%llu vs %llu)",
-				  (long long)off,
-				  (long long)prev, (long long)frec_len(frec));
-	}
-	return TDB_SUCCESS;
-}
-
-static enum TDB_ERROR check_free_table(struct tdb_context *tdb,
-				       tdb_off_t ftable_off,
-				       unsigned ftable_num,
-				       tdb_off_t fr[],
-				       size_t num_free,
-				       size_t *num_found)
-{
-	struct tdb_freetable ft;
-	tdb_off_t h;
-	unsigned int i;
-	enum TDB_ERROR ecode;
-
-	ecode = tdb_read_convert(tdb, ftable_off, &ft, sizeof(ft));
-	if (ecode != TDB_SUCCESS) {
-		return ecode;
-	}
-
-	if (rec_magic(&ft.hdr) != TDB_FTABLE_MAGIC
-	    || rec_key_length(&ft.hdr) != 0
-	    || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)
-	    || rec_hash(&ft.hdr) != 0) {
-		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-				  "tdb_check: Invalid header on free table");
-	}
-
-	for (i = 0; i < TDB_FREE_BUCKETS; i++) {
-		tdb_off_t off, prev = 0, *p, first = 0;
-		struct tdb_free_record f;
-
-		h = bucket_off(ftable_off, i);
-		for (off = tdb_read_off(tdb, h); off; off = f.next) {
-			if (TDB_OFF_IS_ERR(off)) {
-				return TDB_OFF_TO_ERR(off);
-			}
-			if (!first) {
-				off &= TDB_OFF_MASK;
-				first = off;
-			}
-			ecode = tdb_read_convert(tdb, off, &f, sizeof(f));
-			if (ecode != TDB_SUCCESS) {
-				return ecode;
-			}
-			ecode = check_free(tdb, off, &f, prev, ftable_num, i);
-			if (ecode != TDB_SUCCESS) {
-				return ecode;
-			}
-
-			/* FIXME: Check hash bits */
-			p = asearch(&off, fr, num_free, off_cmp);
-			if (!p) {
-				return tdb_logerr(tdb, TDB_ERR_CORRUPT,
-						  TDB_LOG_ERROR,
-						  "tdb_check: Invalid offset"
-						  " %llu in free table",
-						  (long long)off);
-			}
-			/* Mark it invalid. */
-			*p ^= 1;
-			(*num_found)++;
-			prev = off;
-		}
-
-		if (first) {
-			/* Now we can check first back pointer. */
-			ecode = tdb_read_convert(tdb, first, &f, sizeof(f));
-			if (ecode != TDB_SUCCESS) {
-				return ecode;
-			}
-			ecode = check_free(tdb, first, &f, prev, ftable_num, i);
-			if (ecode != TDB_SUCCESS) {
-				return ecode;
-			}
-		}
-	}
-	return TDB_SUCCESS;
-}
-
-/* Slow, but should be very rare. */
-tdb_off_t dead_space(struct tdb_context *tdb, tdb_off_t off)
-{
-	size_t len;
-	enum TDB_ERROR ecode;
-
-	for (len = 0; off + len < tdb->file->map_size; len++) {
-		char c;
-		ecode = tdb->io->tread(tdb, off, &c, 1);
-		if (ecode != TDB_SUCCESS) {
-			return TDB_ERR_TO_OFF(ecode);
-		}
-		if (c != 0 && c != 0x43)
-			break;
-	}
-	return len;
-}
-
-static enum TDB_ERROR check_linear(struct tdb_context *tdb,
-				   tdb_off_t **used, size_t *num_used,
-				   tdb_off_t **fr, size_t *num_free,
-				   uint64_t features, tdb_off_t recovery)
-{
-	tdb_off_t off;
-	tdb_len_t len;
-	enum TDB_ERROR ecode;
-	bool found_recovery = false;
-
-	for (off = sizeof(struct tdb_header);
-	     off < tdb->file->map_size;
-	     off += len) {
-		union {
-			struct tdb_used_record u;
-			struct tdb_free_record f;
-			struct tdb_recovery_record r;
-		} rec;
-		/* r is larger: only get that if we need to. */
-		ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.f));
-		if (ecode != TDB_SUCCESS) {
-			return ecode;
-		}
-
-		/* If we crash after ftruncate, we can get zeroes or fill. */
-		if (rec.r.magic == TDB_RECOVERY_INVALID_MAGIC
-		    || rec.r.magic ==  0x4343434343434343ULL) {
-			ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
-			if (ecode != TDB_SUCCESS) {
-				return ecode;
-			}
-			if (recovery == off) {
-				found_recovery = true;
-				len = sizeof(rec.r) + rec.r.max_len;
-			} else {
-				len = dead_space(tdb, off);
-				if (TDB_OFF_IS_ERR(len)) {
-					return TDB_OFF_TO_ERR(len);
-				}
-				if (len < sizeof(rec.r)) {
-					return tdb_logerr(tdb, TDB_ERR_CORRUPT,
-							  TDB_LOG_ERROR,
-							  "tdb_check: invalid"
-							  " dead space at %zu",
-							  (size_t)off);
-				}
-
-				tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
-					   "Dead space at %zu-%zu (of %zu)",
-					   (size_t)off, (size_t)(off + len),
-					   (size_t)tdb->file->map_size);
-			}
-		} else if (rec.r.magic == TDB_RECOVERY_MAGIC) {
-			ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
-			if (ecode != TDB_SUCCESS) {
-				return ecode;
-			}
-			if (recovery != off) {
-				return tdb_logerr(tdb, TDB_ERR_CORRUPT,
-						  TDB_LOG_ERROR,
-						  "tdb_check: unexpected"
-						  " recovery record at offset"
-						  " %zu",
-						  (size_t)off);
-			}
-			if (rec.r.len > rec.r.max_len) {
-				return tdb_logerr(tdb, TDB_ERR_CORRUPT,
-						  TDB_LOG_ERROR,
-						  "tdb_check: invalid recovery"
-						  " length %zu",
-						  (size_t)rec.r.len);
-			}
-			if (rec.r.eof > tdb->file->map_size) {
-				return tdb_logerr(tdb, TDB_ERR_CORRUPT,
-						  TDB_LOG_ERROR,
-						  "tdb_check: invalid old EOF"
-						  " %zu", (size_t)rec.r.eof);
-			}
-			found_recovery = true;
-			len = sizeof(rec.r) + rec.r.max_len;
-		} else if (frec_magic(&rec.f) == TDB_FREE_MAGIC) {
-			len = sizeof(rec.u) + frec_len(&rec.f);
-			if (off + len > tdb->file->map_size) {
-				return tdb_logerr(tdb, TDB_ERR_CORRUPT,
-						  TDB_LOG_ERROR,
-						  "tdb_check: free overlength"
-						  " %llu at offset %llu",
-						  (long long)len,
-						  (long long)off);
-			}
-			/* This record should be in free lists. */
-			if (frec_ftable(&rec.f) != TDB_FTABLE_NONE
-			    && !append(fr, num_free, off)) {
-				return tdb_logerr(tdb, TDB_ERR_OOM,
-						  TDB_LOG_ERROR,
-						  "tdb_check: tracking %zu'th"
-						  " free record.", *num_free);
-			}
-		} else if (rec_magic(&rec.u) == TDB_USED_MAGIC
-			   || rec_magic(&rec.u) == TDB_CHAIN_MAGIC
-			   || rec_magic(&rec.u) == TDB_HTABLE_MAGIC
-			   || rec_magic(&rec.u) == TDB_FTABLE_MAGIC
-			   || rec_magic(&rec.u) == TDB_CAP_MAGIC) {
-			uint64_t klen, dlen, extra;
-
-			/* This record is used! */
-			if (!append(used, num_used, off)) {
-				return tdb_logerr(tdb, TDB_ERR_OOM,
-						  TDB_LOG_ERROR,
-						  "tdb_check: tracking %zu'th"
-						  " used record.", *num_used);
-			}
-
-			klen = rec_key_length(&rec.u);
-			dlen = rec_data_length(&rec.u);
-			extra = rec_extra_padding(&rec.u);
-
-			len = sizeof(rec.u) + klen + dlen + extra;
-			if (off + len > tdb->file->map_size) {
-				return tdb_logerr(tdb, TDB_ERR_CORRUPT,
-						  TDB_LOG_ERROR,
-						  "tdb_check: used overlength"
-						  " %llu at offset %llu",
-						  (long long)len,
-						  (long long)off);
-			}
-
-			if (len < sizeof(rec.f)) {
-				return tdb_logerr(tdb, TDB_ERR_CORRUPT,
-						  TDB_LOG_ERROR,
-						  "tdb_check: too short record"
-						  " %llu at %llu",
-						  (long long)len,
-						  (long long)off);
-			}
-
-			/* Check that records have correct 0 at end (but may
-			 * not in future). */
-			if (extra && !features
-			    && rec_magic(&rec.u) != TDB_CAP_MAGIC) {
-				const char *p;
-				char c;
-				p = tdb_access_read(tdb, off + sizeof(rec.u)
-						    + klen + dlen, 1, false);
-				if (TDB_PTR_IS_ERR(p))
-					return TDB_PTR_ERR(p);
-				c = *p;
-				tdb_access_release(tdb, p);
-
-				if (c != '\0') {
-					return tdb_logerr(tdb, TDB_ERR_CORRUPT,
-							  TDB_LOG_ERROR,
-							  "tdb_check:"
-							  " non-zero extra"
-							  " at %llu",
-							  (long long)off);
-				}
-			}
-		} else {
-			return tdb_logerr(tdb, TDB_ERR_CORRUPT,
-					  TDB_LOG_ERROR,
-					  "tdb_check: Bad magic 0x%llx"
-					  " at offset %zu",
-					  (long long)rec_magic(&rec.u),
-					  (size_t)off);
-		}
-	}
-
-	/* We must have found recovery area if there was one. */
-	if (recovery != 0 && !found_recovery) {
-		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-				  "tdb_check: expected a recovery area at %zu",
-				  (size_t)recovery);
-	}
-
-	return TDB_SUCCESS;
-}
-
-_PUBLIC_ enum TDB_ERROR tdb_check_(struct tdb_context *tdb,
-			  enum TDB_ERROR (*check)(TDB_DATA, TDB_DATA, void *),
-			  void *data)
-{
-	tdb_off_t *fr = NULL, *used = NULL, ft, recovery;
-	size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0,
-		num_capabilities = 0;
-	uint64_t features;
-	enum TDB_ERROR ecode;
-
-	if (tdb->flags & TDB_CANT_CHECK) {
-		return tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
-				  "tdb_check: database has unknown capability,"
-				  " cannot check.");
-	}
-
-	ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
-	if (ecode != TDB_SUCCESS) {
-		return tdb->last_error = ecode;
-	}
-
-	ecode = tdb_lock_expand(tdb, F_RDLCK);
-	if (ecode != TDB_SUCCESS) {
-		tdb_allrecord_unlock(tdb, F_RDLCK);
-		return tdb->last_error = ecode;
-	}
-
-	ecode = check_header(tdb, &recovery, &features, &num_capabilities);
-	if (ecode != TDB_SUCCESS)
-		goto out;
-
-	/* First we do a linear scan, checking all records. */
-	ecode = check_linear(tdb, &used, &num_used, &fr, &num_free, features,
-			     recovery);
-	if (ecode != TDB_SUCCESS)
-		goto out;
-
-	for (ft = first_ftable(tdb); ft; ft = next_ftable(tdb, ft)) {
-		if (TDB_OFF_IS_ERR(ft)) {
-			ecode = TDB_OFF_TO_ERR(ft);
-			goto out;
-		}
-		ecode = check_free_table(tdb, ft, num_ftables, fr, num_free,
-					 &num_found);
-		if (ecode != TDB_SUCCESS)
-			goto out;
-		num_ftables++;
-	}
-
-	/* FIXME: Check key uniqueness? */
-	ecode = check_hash(tdb, used, num_used, num_ftables + num_capabilities,
-			   check, data);
-	if (ecode != TDB_SUCCESS)
-		goto out;
-
-	if (num_found != num_free) {
-		ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-				   "tdb_check: Not all entries are in"
-				   " free table");
-	}
-
-out:
-	tdb_allrecord_unlock(tdb, F_RDLCK);
-	tdb_unlock_expand(tdb, F_RDLCK);
-	free(fr);
-	free(used);
-	return tdb->last_error = ecode;
-}
diff --git a/lib/tdb2/configure b/lib/tdb2/configure
deleted file mode 100755
index 6a9f875511..0000000000
--- a/lib/tdb2/configure
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/sh
-
-PREVPATH=`dirname $0`
-
-if [ -f $PREVPATH/../../buildtools/bin/waf ]; then
-	WAF=../../buildtools/bin/waf
-elif [ -f $PREVPATH/buildtools/bin/waf ]; then
-	WAF=./buildtools/bin/waf
-else
-	echo "replace: Unable to find waf"
-	exit 1
-fi
-
-# using JOBS=1 gives maximum compatibility with
-# systems like AIX which have broken threading in python
-JOBS=1
-export JOBS
-
-cd . || exit 1
-$WAF configure "$@" || exit 1
-cd $PREVPATH
diff --git a/lib/tdb2/doc/TDB1_porting.txt b/lib/tdb2/doc/TDB1_porting.txt
deleted file mode 100644
index e59295c22f..0000000000
--- a/lib/tdb2/doc/TDB1_porting.txt
+++ /dev/null
@@ -1,72 +0,0 @@
-Interface differences between TDB1 and TDB2.
-
-- tdb2 uses 'struct tdb_data', tdb1 uses 'struct TDB_DATA'.  Use the
-  TDB_DATA typedef if you want portability between the two.
-
-- tdb2 functions return 0 on success, and a negative error on failure,
-  whereas tdb1 functions returned 0 on success, and -1 on failure.
-  tdb1 then used tdb_error() to determine the error; this is also
-  supported in tdb2 to ease backwards compatibility, though the other
-  form is preferred.
-
-- tdb2's tdb_fetch() returns an error, tdb1's returned the data directly
-  (or tdb_null, and you were supposed to check tdb_error() to find out why).
-
-- tdb2's tdb_nextkey() frees the old key's dptr, in tdb1 you needed to do
-  this manually.
-
-- tdb1's tdb_open/tdb_open_ex took an explicit hash size.  tdb2's hash table
-  resizes as required.
-
-- tdb2 uses a linked list of attribute structures to implement logging and
-  alternate hashes.  tdb1 used tdb_open_ex, which was not extensible.
-
-- tdb2 does locking on read-only databases (ie. O_RDONLY passed to tdb_open).
-  tdb1 did not: use the TDB_NOLOCK flag if you want to suppress locking.
-
-- tdb2's log function is simpler than tdb1's log function.  The string is
-  already formatted, and it takes an enum tdb_log_level not a tdb_debug_level,
-  and which has only three values: TDB_LOG_ERROR, TDB_LOG_USE_ERROR and
-  TDB_LOG_WARNING.
-
-- tdb2 provides tdb_deq() for comparing two struct tdb_data.
-
-- tdb2's tdb_name() returns a copy of the name even for TDB_INTERNAL dbs.
-
-- tdb2 does not need tdb_reopen() or tdb_reopen_all().  If you call
-  fork() after during certain operations the child should close the
-  tdb, or complete the operations before continuing to use the tdb:
-
-	tdb_transaction_start(): child must tdb_transaction_cancel()
-	tdb_lockall(): child must call tdb_unlockall()
-	tdb_lockall_read(): child must call tdb_unlockall_read()
-	tdb_chainlock(): child must call tdb_chainunlock()
-	tdb_parse() callback: child must return from tdb_parse()
-
-- tdb2 will not open a non-tdb file, even if O_CREAT is specified.
-
-- There is no tdb_traverse_read.  For operating on TDB1 files, you can
-  simulate it by tdb_add_flag(tdb, TDB_RDONLY); tdb_traverse();
-  tdb_remove_flag(tdb, TDB_RDONLY).  This may be desirable because
-  traverse on TDB1 files use a write lock on the entire database
-  unless it's read-only.
-
-- Failure inside a transaction (such as a lock function failing) does
-  not implicitly cancel the transaction; you still need to call
-  tdb_transaction_cancel().
-
-TDB1 Compatibility:
-
-- tdb2's offers a tdb1_incompatible_hash function, which is the same
-  as the default hash with the TDB_INCOMPATIBLE_HASH flag.  There is
-  no way of marking an old TDB incompatible with versions < 1.2.6
-  while using any other hash.
-
-- The TDB_ATTRIBUTE_TDB1_HASHSIZE attribute can be used to control the
-  hash size, but only when creating (ie. O_CREAT) a TDB1
-  (ie. TDB_VERSION1).
-
-- There is no TDB_CLEAR_IF_FIRST flag; it has severe scalability and
-  API problems.  If necessary, you can emulate this by using the open
-  hook and placing a 1-byte lock at offset 4.  If your program forks,
-  you will need to place this lock again in the child.
diff --git a/lib/tdb2/doc/design-1.3.txt b/lib/tdb2/doc/design-1.3.txt
deleted file mode 100644
index f81ecf7885..0000000000
--- a/lib/tdb2/doc/design-1.3.txt
+++ /dev/null
@@ -1,1049 +0,0 @@
-TDB2: Redesigning The Trivial DataBase
-
-Rusty Russell, IBM Corporation
-
-27-April-2010
-
-Abstract
-
-The Trivial DataBase on-disk format is 32 bits; with usage cases
-heading towards the 4G limit, that must change. This required
-breakage provides an opportunity to revisit TDB's other design
-decisions and reassess them.
-
-1 Introduction
-
-The Trivial DataBase was originally written by Andrew Tridgell as
-a simple key/data pair storage system with the same API as dbm,
-but allowing multiple readers and writers while being small
-enough (< 1000 lines of C) to include in SAMBA. The simple design
-created in 1999 has proven surprisingly robust and performant,
-used in Samba versions 3 and 4 as well as numerous other
-projects. Its useful life was greatly increased by the
-(backwards-compatible!) addition of transaction support in 2005.
-
-The wider variety and greater demands of TDB-using code has led
-to some organic growth of the API, as well as some compromises on
-the implementation. None of these, by themselves, are seen as
-show-stoppers, but the cumulative effect is a loss of elegance
-over the initial, simple TDB implementation. Here is a table of
-the approximate number of lines of implementation code and number
-of API functions at the end of each year:
-
-
-+-----------+----------------+--------------------------------+
-| Year End  | API Functions  | Lines of C Code Implementation |
-+-----------+----------------+--------------------------------+
-+-----------+----------------+--------------------------------+
-|   1999    |      13        |              1195              |
-+-----------+----------------+--------------------------------+
-|   2000    |      24        |              1725              |
-+-----------+----------------+--------------------------------+
-|   2001    |      32        |              2228              |
-+-----------+----------------+--------------------------------+
-|   2002    |      35        |              2481              |
-+-----------+----------------+--------------------------------+
-|   2003    |      35        |              2552              |
-+-----------+----------------+--------------------------------+
-|   2004    |      40        |              2584              |
-+-----------+----------------+--------------------------------+
-|   2005    |      38        |              2647              |
-+-----------+----------------+--------------------------------+
-|   2006    |      52        |              3754              |
-+-----------+----------------+--------------------------------+
-|   2007    |      66        |              4398              |
-+-----------+----------------+--------------------------------+
-|   2008    |      71        |              4768              |
-+-----------+----------------+--------------------------------+
-|   2009    |      73        |              5715              |
-+-----------+----------------+--------------------------------+
-
-
-This review is an attempt to catalog and address all the known
-issues with TDB and create solutions which address the problems
-without significantly increasing complexity; all involved are far
-too aware of the dangers of second system syndrome in rewriting a
-successful project like this.
-
-2 API Issues
-
-2.1 tdb_open_ex Is Not Expandable
-
-The tdb_open() call was expanded to tdb_open_ex(), which added an
-optional hashing function and an optional logging function
-argument. Additional arguments to open would require the
-introduction of a tdb_open_ex2 call etc.
-
-2.1.1 Proposed Solution
-
-tdb_open() will take a linked-list of attributes:
-
-enum tdb_attribute {
-
-    TDB_ATTRIBUTE_LOG = 0,
-
-    TDB_ATTRIBUTE_HASH = 1
-
-};
-
-struct tdb_attribute_base {
-
-    enum tdb_attribute attr;
-
-    union tdb_attribute *next;
-
-};
-
-struct tdb_attribute_log {
-
-    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG
-*/
-
-    tdb_log_func log_fn;
-
-    void *log_private;
-
-};
-
-struct tdb_attribute_hash {
-
-    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH
-*/
-
-    tdb_hash_func hash_fn;
-
-    void *hash_private;
-
-};
-
-union tdb_attribute {
-
-    struct tdb_attribute_base base;
-
-    struct tdb_attribute_log log;
-
-    struct tdb_attribute_hash hash;
-
-};
-
-This allows future attributes to be added, even if this expands
-the size of the union.
-
-2.2 tdb_traverse Makes Impossible Guarantees
-
-tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions,
-and it was thought that it was important to guarantee that all
-records which exist at the start and end of the traversal would
-be included, and no record would be included twice.
-
-This adds complexity (see [Reliable-Traversal-Adds]) and does not
-work anyway for records which are altered (in particular, those
-which are expanded may be effectively deleted and re-added behind
-the traversal).
-
-2.2.1 <traverse-Proposed-Solution>Proposed Solution
-
-Abandon the guarantee. You will see every record if no changes
-occur during your traversal, otherwise you will see some subset.
-You can prevent changes by using a transaction or the locking
-API.
-
-2.3 Nesting of Transactions Is Fraught
-
-TDB has alternated between allowing nested transactions and not
-allowing them. Various paths in the Samba codebase assume that
-transactions will nest, and in a sense they can: the operation is
-only committed to disk when the outer transaction is committed.
-There are two problems, however:
-
-1. Canceling the inner transaction will cause the outer
-  transaction commit to fail, and will not undo any operations
-  since the inner transaction began. This problem is soluble with
-  some additional internal code.
-
-2. An inner transaction commit can be cancelled by the outer
-  transaction. This is desirable in the way which Samba's
-  database initialization code uses transactions, but could be a
-  surprise to any users expecting a successful transaction commit
-  to expose changes to others.
-
-The current solution is to specify the behavior at tdb_open(),
-with the default currently that nested transactions are allowed.
-This flag can also be changed at runtime.
-
-2.3.1 Proposed Solution
-
-Given the usage patterns, it seems that the “least-surprise”
-behavior of disallowing nested transactions should become the
-default. Additionally, it seems the outer transaction is the only
-code which knows whether inner transactions should be allowed, so
-a flag to indicate this could be added to tdb_transaction_start.
-However, this behavior can be simulated with a wrapper which uses
-tdb_add_flags() and tdb_remove_flags(), so the API should not be
-expanded for this relatively-obscure case.
-
-2.4 Incorrect Hash Function is Not Detected
-
-tdb_open_ex() allows the calling code to specify a different hash
-function to use, but does not check that all other processes
-accessing this tdb are using the same hash function. The result
-is that records are missing from tdb_fetch().
-
-2.4.1 Proposed Solution
-
-The header should contain an example hash result (eg. the hash of
-0xdeadbeef), and tdb_open_ex() should check that the given hash
-function produces the same answer, or fail the tdb_open call.
-
-2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
-
-In response to scalability issues with the free list ([TDB-Freelist-Is]
-) two API workarounds have been incorporated in TDB:
-tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open. The
-latter actually calls the former with an argument of “5”.
-
-This code allows deleted records to accumulate without putting
-them in the free list. On delete we iterate through each chain
-and free them in a batch if there are more than max_dead entries.
-These are never otherwise recycled except as a side-effect of a
-tdb_repack.
-
-2.5.1 Proposed Solution
-
-With the scalability problems of the freelist solved, this API
-can be removed. The TDB_VOLATILE flag may still be useful as a
-hint that store and delete of records will be at least as common
-as fetch in order to allow some internal tuning, but initially
-will become a no-op.
-
-2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times
-  In The Same Process
-
-No process can open the same TDB twice; we check and disallow it.
-This is an unfortunate side-effect of fcntl locks, which operate
-on a per-file rather than per-file-descriptor basis, and do not
-nest. Thus, closing any file descriptor on a file clears all the
-locks obtained by this process, even if they were placed using a
-different file descriptor!
-
-Note that even if this were solved, deadlock could occur if
-operations were nested: this is a more manageable programming
-error in most cases.
-
-2.6.1 Proposed Solution
-
-We could lobby POSIX to fix the perverse rules, or at least lobby
-Linux to violate them so that the most common implementation does
-not have this restriction. This would be a generally good idea
-for other fcntl lock users.
-
-Samba uses a wrapper which hands out the same tdb_context to
-multiple callers if this happens, and does simple reference
-counting. We should do this inside the tdb library, which already
-emulates lock nesting internally; it would need to recognize when
-deadlock occurs within a single process. This would create a new
-failure mode for tdb operations (while we currently handle
-locking failures, they are impossible in normal use and a process
-encountering them can do little but give up).
-
-I do not see benefit in an additional tdb_open flag to indicate
-whether re-opening is allowed, though there may be some
-benefit to adding a call to detect when a tdb_context is shared,
-to allow others to create such an API.
-
-2.7 TDB API Is Not POSIX Thread-safe
-
-The TDB API uses an error code which can be queried after an
-operation to determine what went wrong. This programming model
-does not work with threads, unless specific additional guarantees
-are given by the implementation. In addition, even
-otherwise-independent threads cannot open the same TDB (as in [TDB-Files-Cannot]
-).
-
-2.7.1 Proposed Solution
-
-Rearchitecting the API to include a tdb_errcode pointer would be a
-great deal of churn; we are better to guarantee that the
-tdb_errcode is per-thread so the current programming model can be
-maintained.
-
-This requires dynamic per-thread allocations, which is awkward
-with POSIX threads (pthread_key_create space is limited and we
-cannot simply allocate a key for every TDB).
-
-Internal locking is required to make sure that fcntl locks do not
-overlap between threads, and also that the global list of tdbs is
-maintained.
-
-The aim is that building tdb with -DTDB_PTHREAD will result in a
-pthread-safe version of the library, and otherwise no overhead
-will exist.
-
-2.8 *_nonblock Functions And *_mark Functions Expose
-  Implementation
-
-CTDB[footnote:
-Clustered TDB, see http://ctdb.samba.org
-] wishes to operate on TDB in a non-blocking manner. This is
-currently done as follows:
-
-1. Call the _nonblock variant of an API function (eg.
-  tdb_lockall_nonblock). If this fails:
-
-2. Fork a child process, and wait for it to call the normal
-  variant (eg. tdb_lockall).
-
-3. If the child succeeds, call the _mark variant to indicate we
-  already have the locks (eg. tdb_lockall_mark).
-
-4. Upon completion, tell the child to release the locks (eg.
-  tdb_unlockall).
-
-5. Indicate to tdb that it should consider the locks removed (eg.
-  tdb_unlockall_mark).
-
-There are several issues with this approach. Firstly, adding two
-new variants of each function clutters the API for an obscure
-use, and so not all functions have three variants. Secondly, it
-assumes that all paths of the functions ask for the same locks,
-otherwise the parent process will have to get a lock which the
-child doesn't have under some circumstances. I don't believe this
-is currently the case, but it constrains the implementation.
-
-2.8.1 <Proposed-Solution-locking-hook>Proposed Solution
-
-Implement a hook for locking methods, so that the caller can
-control the calls to create and remove fcntl locks. In this
-scenario, ctdbd would operate as follows:
-
-1. Call the normal API function, eg tdb_lockall().
-
-2. When the lock callback comes in, check if the child has the
-  lock. Initially, this is always false. If the child has it, return 0.
-  Otherwise, try to obtain it in non-blocking mode. If that
-  fails, return EWOULDBLOCK.
-
-3. Release locks in the unlock callback as normal.
-
-4. If tdb_lockall() fails, see if we recorded a lock failure; if
-  so, call the child to repeat the operation.
-
-5. The child records what locks it obtains, and returns that
-  information to the parent.
-
-6. When the child has succeeded, goto 1.
-
-This is flexible enough to handle any potential locking scenario,
-even when lock requirements change. It can be optimized so that
-the parent does not release locks, just tells the child which
-locks it doesn't need to obtain.
-
-It also keeps the complexity out of the API, and in ctdbd where
-it is needed.
-
-2.9 tdb_chainlock Functions Expose Implementation
-
-tdb_chainlock locks some number of records, including the record
-indicated by the given key. This gave atomicity guarantees;
-no-one can start a transaction, alter, read or delete that key
-while the lock is held.
-
-It also makes the same guarantee for any other key in the chain,
-which is an internal implementation detail and potentially a
-cause for deadlock.
-
-2.9.1 Proposed Solution
-
-None. It would be nice to have an explicit single entry lock
-which affected no other keys. Unfortunately, this won't work for
-an entry which doesn't exist. Thus while chainlock may be
-implemented more efficiently for the existing case, it will still
-have overlap issues with the non-existing case. So it is best to
-keep the current (lack of) guarantee about which records will be
-affected to avoid constraining our implementation.
-
-2.10 Signal Handling is Not Race-Free
-
-The tdb_setalarm_sigptr() call allows the caller's signal handler
-to indicate that the tdb locking code should return with a
-failure, rather than trying again when a signal is received (and
-errno == EAGAIN). This is usually used to implement timeouts.
-
-Unfortunately, this does not work in the case where the signal is
-received before the tdb code enters the fcntl() call to place the
-lock: the code will sleep within the fcntl() code, unaware that
-the signal wants it to exit. In the case of long timeouts, this
-does not happen in practice.
-
-2.10.1 Proposed Solution
-
-The locking hooks proposed in [Proposed-Solution-locking-hook]
-would allow the user to decide on whether to fail the lock
-acquisition on a signal. This allows the caller to choose their
-own compromise: they could narrow the race by checking
-immediately before the fcntl call.[footnote:
-It may be possible to make this race-free in some implementations
-by having the signal handler alter the struct flock to make it
-invalid. This will cause the fcntl() lock call to fail with
-EINVAL if the signal occurs before the kernel is entered,
-otherwise EAGAIN.
-]
-
-2.11 The API Uses Gratuitous Typedefs, Capitals
-
-typedefs are useful for providing source compatibility when types
-can differ across implementations, or arguably in the case of
-function pointer definitions which are hard for humans to parse.
-Otherwise it is simply obfuscation and pollutes the namespace.
-
-Capitalization is usually reserved for compile-time constants and
-macros.
-
-  TDB_CONTEXT There is no reason to use this over 'struct
-  tdb_context'; the definition isn't visible to the API user
-  anyway.
-
-  TDB_DATA There is no reason to use this over struct TDB_DATA;
-  the struct needs to be understood by the API user.
-
-  struct TDB_DATA This would normally be called 'struct
-  tdb_data'.
-
-  enum TDB_ERROR Similarly, this would normally be enum
-  tdb_error.
-
-2.11.1 Proposed Solution
-
-None. Introducing lower case variants would please pedants like
-myself, but if it were done the existing ones should be kept.
-There is little point forcing a purely cosmetic change upon tdb
-users.
-
-2.12 <tdb_log_func-Doesnt-Take>tdb_log_func Doesn't Take The
-  Private Pointer
-
-For API compatibility reasons, the logging function needs to call
-tdb_get_logging_private() to retrieve the pointer registered by
-the tdb_open_ex for logging.
-
-2.12.1 Proposed Solution
-
-It should simply take an extra argument, since we are prepared to
-break the API/ABI.
-
-2.13 Various Callback Functions Are Not Typesafe
-
-The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take]
- is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read
-and tdb_check all take void * and must internally convert it to
-the argument type they were expecting.
-
-If this type changes, the compiler will not produce warnings on
-the callers, since it only sees void *.
-
-2.13.1 Proposed Solution
-
-With careful use of macros, we can create callback functions
-which give a warning when used on gcc and the types of the
-callback and its private argument differ. Unsupported compilers
-will not give a warning, which is no worse than now. In addition,
-the callbacks become clearer, as they need not use void * for
-their parameter.
-
-See CCAN's typesafe_cb module at
-http://ccan.ozlabs.org/info/typesafe_cb.html
-
-2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens,
-  tdb_reopen_all Problematic
-
-The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB
-file should be cleared if the caller discovers it is the only
-process with the TDB open. However, if any caller does not
-specify TDB_CLEAR_IF_FIRST it will not be detected, so will have
-the TDB erased underneath them (usually resulting in a crash).
-
-There is a similar issue on fork(); if the parent exits (or
-otherwise closes the tdb) before the child calls tdb_reopen_all()
-to establish the lock used to indicate the TDB is opened by
-someone, a TDB_CLEAR_IF_FIRST opener at that moment will believe
-it alone has opened the TDB and will erase it.
-
-2.14.1 Proposed Solution
-
-Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but
-see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
-
-3 Performance And Scalability Issues
-
-3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST
-  Imposes Performance Penalty
-
-When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is
-placed at offset 4 (aka. the ACTIVE_LOCK). While these locks
-never conflict in normal tdb usage, they do add substantial
-overhead for most fcntl lock implementations when the kernel
-scans to detect if a lock conflict exists. This is often a single
-linked list, making the time to acquire and release a fcntl lock
-O(N) where N is the number of processes with the TDB open, not
-the number actually doing work.
-
-In a Samba server it is common to have huge numbers of clients
-sitting idle, and thus they have weaned themselves off the
-TDB_CLEAR_IF_FIRST flag.[footnote:
-There is a flag to tdb_reopen_all() which is used for this
-optimization: if the parent process will outlive the child, the
-child does not need the ACTIVE_LOCK. This is a workaround for
-this very performance issue.
-]
-
-3.1.1 Proposed Solution
-
-Remove the flag. It was a neat idea, but even trivial servers
-tend to know when they are initializing for the first time and
-can simply unlink the old tdb at that point.
-
-3.2 TDB Files Have a 4G Limit
-
-This seems to be becoming an issue (so much for “trivial”!),
-particularly for ldb.
-
-3.2.1 Proposed Solution
-
-A new, incompatible TDB format which uses 64 bit offsets
-internally rather than 32 bit as now. For simplicity of endian
-conversion (which TDB does on the fly if required), all values
-will be 64 bit on disk. In practice, some upper bits may be used
-for other purposes, but at least 56 bits will be available for
-file offsets.
-
-tdb_open() will automatically detect the old version, and even
-create them if TDB_VERSION6 is specified to tdb_open.
-
-32 bit processes will still be able to access TDBs larger than 4G
-(assuming that their off_t allows them to seek to 64 bits), they
-will gracefully fall back as they fail to mmap. This can happen
-already with large TDBs.
-
-Old versions of tdb will fail to open the new TDB files (since 28
-August 2009, commit 398d0c29290: prior to that any unrecognized
-file format would be erased and initialized as a fresh tdb!)
-
-3.3 TDB Records Have a 4G Limit
-
-This has not been a reported problem, and the API uses size_t
-which can be 64 bit on 64 bit platforms. However, other limits
-may have made such an issue moot.
-
-3.3.1 Proposed Solution
-
-Record sizes will be 64 bit, with an error returned on 32 bit
-platforms which try to access such records (the current
-implementation would return TDB_ERR_OOM in a similar case). It
-seems unlikely that 32 bit keys will be a limitation, so the
-implementation may not support this (see [sub:Records-Incur-A]).
-
-3.4 Hash Size Is Determined At TDB Creation Time
-
-TDB contains a number of hash chains in the header; the number is
-specified at creation time, and defaults to 131. This is such a
-bottleneck on large databases (as each hash chain gets quite
-long), that LDB uses 10,000 for this hash. In general it is
-impossible to know what the 'right' answer is at database
-creation time.
-
-3.4.1 Proposed Solution
-
-After comprehensive performance testing on various scalable hash
-variants[footnote:
-http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94
-This was annoying because I was previously convinced that an
-expanding tree of hashes would be very close to optimal.
-], it became clear that it is hard to beat a straight linear hash
-table which doubles in size when it reaches saturation. There are
-three details which become important:
-
-1. On encountering a full bucket, we use the next bucket.
-
-2. Extra hash bits are stored with the offset, to reduce
-  comparisons.
-
-3. A marker entry is used on deleting an entry.
-
-The doubling of the table must be done under a transaction; we
-will not reduce it on deletion, so it will be an unusual case. It
-will either be placed at the head (other entries will be moved
-out the way so we can expand). We could have a pointer in the
-header to the current hashtable location, but that pointer would
-have to be read frequently to check for hashtable moves.
-
-The locking for this is slightly more complex than the chained
-case; we currently have one lock per bucket, and that means we
-would need to expand the lock if we overflow to the next bucket.
-The frequency of such collisions will affect our locking
-heuristics: we can always lock more buckets than we need.
-
-One possible optimization is to only re-check the hash size on an
-insert or a lookup miss.
-
-3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
-
-TDB uses a single linked list for the free list. Allocation
-occurs as follows, using heuristics which have evolved over time:
-
-1. Get the free list lock for this whole operation.
-
-2. Multiply length by 1.25, so we always over-allocate by 25%.
-
-3. Set the slack multiplier to 1.
-
-4. Examine the current freelist entry: if it is > length but <
-  the current best case, remember it as the best case.
-
-5. Multiply the slack multiplier by 1.05.
-
-6. If our best fit so far is less than length * slack multiplier,
-  return it. The slack will be turned into a new free record if
-  it's large enough.
-
-7. Otherwise, go onto the next freelist entry.
-
-Deleting a record occurs as follows:
-
-1. Lock the hash chain for this whole operation.
-
-2. Walk the chain to find the record, keeping the prev pointer
-  offset.
-
-3. If max_dead is non-zero:
-
-  (a) Walk the hash chain again and count the dead records.
-
-  (b) If it's more than max_dead, bulk free all the dead ones
-    (similar to steps 4 and below, but the lock is only obtained
-    once).
-
-  (c) Simply mark this record as dead and return.
-
-4. Get the free list lock for the remainder of this operation.
-
-5. <right-merging>Examine the following block to see if it is
-  free; if so, enlarge the current block and remove that block
-  from the free list. This was disabled, as removal from the free
-  list was O(entries-in-free-list).
-
-6. Examine the preceding block to see if it is free: for this
-  reason, each block has a 32-bit tailer which indicates its
-  length. If it is free, expand it to cover our new block and
-  return.
-
-7. Otherwise, prepend ourselves to the free list.
-
-Disabling right-merging (step [right-merging]) causes
-fragmentation; the other heuristics proved insufficient to
-address this, so the final answer to this was that when we expand
-the TDB file inside a transaction commit, we repack the entire
-tdb.
-
-The single list lock limits our allocation rate; due to the other
-issues this is not currently seen as a bottleneck.
-
-3.5.1 Proposed Solution
-
-The first step is to remove all the current heuristics, as they
-obviously interact, then examine them once the lock contention is
-addressed.
-
-The free list must be split to reduce contention. Assuming
-perfect free merging, we can at most have 1 free list entry for
-each entry. This implies that the number of free lists is related
-to the size of the hash table, but as it is rare to walk a large
-number of free list entries we can use far fewer, say 1/32 of the
-number of hash buckets.
-
-There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fragmented]
-) but it's not clear this would reduce contention in the common
-case where all processes are allocating/freeing the same size.
-Thus we almost certainly need to divide in other ways: the most
-obvious is to divide the file into zones, and using a free list
-(or set of free lists) for each. This approximates address
-ordering.
-
-Note that this means we need to split the free lists when we
-expand the file; this is probably acceptable when we double the
-hash table size, since that is such an expensive operation
-already. In the case of increasing the file size, there is an
-optimization we can use: if we use M in the formula above as the
-file size rounded up to the next power of 2, we only need
-reshuffle free lists when the file size crosses a power of 2
-boundary, and reshuffling the free lists is trivial: we simply
-merge every consecutive pair of free lists.
-
-The basic algorithm is as follows. Freeing is simple:
-
-1. Identify the correct zone.
-
-2. Lock the corresponding list.
-
-3. Re-check the zone (we didn't have a lock, sizes could have
-  changed): relock if necessary.
-
-4. Place the freed entry in the list for that zone.
-
-Allocation is a little more complicated, as we perform delayed
-coalescing at this point:
-
-1. Pick a zone: either the zone we last freed into, or based on a
-  “random” number.
-
-2. Lock the corresponding list.
-
-3. Re-check the zone: relock if necessary.
-
-4. If the top entry is large enough, remove it from the list and
-  return it.
-
-5. Otherwise, coalesce entries in the list.
-
-  (a)
-
-  (b)
-
-  (c)
-
-  (d)
-
-6. If there was no entry large enough, unlock the list and try
-  the next zone.
-
-7.
-
-8.
-
-9. If no zone satisfies, expand the file.
-
-This optimizes rapid insert/delete of free list entries by not
-coalescing them all the time. First-fit address ordering
-seems to be fairly good for keeping fragmentation low
-(see [sub:TDB-Becomes-Fragmented]). Note that address ordering
-does not need a tailer to coalesce, though if we needed one we
-could have one cheaply: see [sub:Records-Incur-A].
-
-
-
-I anticipate that the number of entries in each free zone would
-be small, but it might be worth using one free entry to hold
-pointers to the others for cache efficiency.
-
-3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented
-
-Much of this is a result of allocation strategy[footnote:
-The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995
-ftp://ftp.cs.utexas.edu/pub/garbage/malloc/ismm98.ps
-] and deliberate hobbling of coalescing; internal fragmentation
-(aka overallocation) is deliberately set at 25%, and external
-fragmentation is only cured by the decision to repack the entire
-db when a transaction commit needs to enlarge the file.
-
-3.6.1 Proposed Solution
-
-The 25% overhead on allocation works in practice for ldb because
-indexes tend to expand by one record at a time. This internal
-fragmentation can be resolved by having an “expanded” bit in the
-header to note entries that have previously expanded, and
-allocating more space for them.
-
-There is a spectrum of possible solutions for external
-fragmentation: one is to use a fragmentation-avoiding allocation
-strategy such as best-fit address-order allocator. The other end
-of the spectrum would be to use a bump allocator (very fast and
-simple) and simply repack the file when we reach the end.
-
-There are three problems with efficient fragmentation-avoiding
-allocators: they are non-trivial, they tend to use a single free
-list for each size, and there's no evidence that tdb allocation
-patterns will match those recorded for general allocators (though
-it seems likely).
-
-Thus we don't spend too much effort on external fragmentation; we
-will be no worse than the current code if we need to repack on
-occasion. More effort is spent on reducing freelist contention,
-and reducing overhead.
-
-3.7 <sub:Records-Incur-A>Records Incur A 28-Byte Overhead
-
-Each TDB record has a header as follows:
-
-struct tdb_record {
-
-        tdb_off_t next; /* offset of the next record in the list
-*/
-
-        tdb_len_t rec_len; /* total byte length of record */
-
-        tdb_len_t key_len; /* byte length of key */
-
-        tdb_len_t data_len; /* byte length of data */
-
-        uint32_t full_hash; /* the full 32 bit hash of the key */
-
-        uint32_t magic;   /* try to catch errors */
-
-        /* the following union is implied:
-
-                union {
-
-                        char record[rec_len];
-
-                        struct {
-
-                                char key[key_len];
-
-                                char data[data_len];
-
-                        }
-
-                        uint32_t totalsize; (tailer)
-
-                }
-
-        */
-
-};
-
-Naively, this would double to a 56-byte overhead on a 64 bit
-implementation.
-
-3.7.1 Proposed Solution
-
-We can use various techniques to reduce this for an allocated
-block:
-
-1. The 'next' pointer is not required, as we are using a flat
-  hash table.
-
-2. 'rec_len' can instead be expressed as an addition to key_len
-  and data_len (it accounts for wasted or overallocated length in
-  the record). Since the record length is always a multiple of 8,
-  we can conveniently fit it in 32 bits (representing up to 35
-  bits).
-
-3. 'key_len' and 'data_len' can be reduced. I'm unwilling to
-  restrict 'data_len' to 32 bits, but instead we can combine the
-  two into one 64-bit field and use a 5 bit value which
-  indicates at what bit to divide the two. Keys are unlikely to
-  scale as fast as data, so I'm assuming a maximum key size of 32
-  bits.
-
-4. 'full_hash' is used to avoid a memcmp on the “miss” case, but
-  this is diminishing returns after a handful of bits (at 10
-  bits, it reduces 99.9% of false memcmp). As an aside, as the
-  lower bits are already incorporated in the hash table
-  resolution, the upper bits should be used here.
-
-5. 'magic' does not need to be enlarged: it currently reflects
-  one of 5 values (used, free, dead, recovery, and
-  unused_recovery). It is useful for quick sanity checking
-  however, and should not be eliminated.
-
-6. 'tailer' is only used to coalesce free blocks (so a block to
-  the right can find the header to check if this block is free).
-  This can be replaced by a single 'free' bit in the header of
-  the following block (and the tailer only exists in free
-  blocks).[footnote:
-This technique from Thomas Standish. Data Structure Techniques.
-Addison-Wesley, Reading, Massachusetts, 1980.
-] The current proposed coalescing algorithm doesn't need this,
-  however.
-
-This produces a 16 byte used header like this:
-
-struct tdb_used_record {
-
-        uint32_t magic : 16,
-
-                 prev_is_free: 1,
-
-                 key_data_divide: 5,
-
-                 top_hash: 10;
-
-        uint32_t extra_octets;
-
-        uint64_t key_and_data_len;
-
-};
-
-And a free record like this:
-
-struct tdb_free_record {
-
-        uint32_t free_magic;
-
-        uint64_t total_length;
-
-        ...
-
-        uint64_t tailer;
-
-};
-
-
-
-3.8 Transaction Commit Requires 4 fdatasync
-
-The current transaction algorithm is:
-
-1. write_recovery_data();
-
-2. sync();
-
-3. write_recovery_header();
-
-4. sync();
-
-5. overwrite_with_new_data();
-
-6. sync();
-
-7. remove_recovery_header();
-
-8. sync();
-
-On current ext3, each sync flushes all data to disk, so the next
-3 syncs are relatively inexpensive. But this could become a
-performance bottleneck on other filesystems such as ext4.
-
-3.8.1 Proposed Solution
-
-
-
-
-
-
-
-
-
-Neil Brown points out that this is overzealous, and only one sync
-is needed:
-
-1. Bundle the recovery data, a transaction counter and a strong
-  checksum of the new data.
-
-2. Strong checksum that whole bundle.
-
-3. Store the bundle in the database.
-
-4. Overwrite the oldest of the two recovery pointers in the
-  header (identified using the transaction counter) with the
-  offset of this bundle.
-
-5. sync.
-
-6. Write the new data to the file.
-
-Checking for recovery means identifying the latest bundle with a
-valid checksum and using the new data checksum to ensure that it
-has been applied. This is more expensive than the current check,
-but need only be done at open. For running databases, a separate
-header field can be used to indicate a transaction in progress;
-we need only check for recovery if this is set.
-
-3.9 TDB Does Not Have Snapshot Support
-
-3.9.1 Proposed Solution
-
-None. At some point you say “use a real database”.
-
-But as a thought experiment, if we implemented transactions to
-only overwrite free entries (this is tricky: there must not be a
-header in each entry which indicates whether it is free, but use
-of presence in metadata elsewhere), and a pointer to the hash
-table, we could create an entirely new commit without destroying
-existing data. Then it would be easy to implement snapshots in a
-similar way.
-
-This would not allow arbitrary changes to the database, such as
-tdb_repack does, and would require more space (since we have to
-preserve the current and future entries at once). If we used hash
-trees rather than one big hash table, we might only have to
-rewrite some sections of the hash, too.
-
-We could then implement snapshots using a similar method, using
-multiple different hash tables/free tables.
-
-3.10 Transactions Cannot Operate in Parallel
-
-This would be useless for ldb, as it hits the index records with
-just about every update. It would add significant complexity in
-resolving clashes, and cause all the transaction callers to write
-their code to loop in the case where the transactions spuriously
-failed.
-
-3.10.1 Proposed Solution
-
-We could solve a small part of the problem by providing read-only
-transactions. These would allow one write transaction to begin,
-but it could not commit until all r/o transactions are done. This
-would require a new RO_TRANSACTION_LOCK, which would be upgraded
-on commit.
-
-3.11 Default Hash Function Is Suboptimal
-
-The Knuth-inspired multiplicative hash used by tdb is fairly slow
-(especially if we expand it to 64 bits), and works best when the
-hash bucket size is a prime number (which also means a slow
-modulus). In addition, it is highly predictable which could
-potentially lead to a Denial of Service attack in some TDB uses.
-
-3.11.1 Proposed Solution
-
-The Jenkins lookup3 hash[footnote:
-http://burtleburtle.net/bob/c/lookup3.c
-] is a fast and superbly-mixing hash. It's used by the Linux
-kernel and almost everything else. This has the particular
-properties that it takes an initial seed, and produces two 32 bit
-hash numbers, which we can combine into a 64-bit hash.
-
-The seed should be created at tdb-creation time from some random
-source, and placed in the header. This is far from foolproof, but
-adds a little bit of protection against hash bombing.
-
-3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity
-
-We lock a record during traversal iteration, and try to grab that
-lock in the delete code. If that grab on delete fails, we simply
-mark it deleted and continue onwards; traversal checks for this
-condition and does the delete when it moves off the record.
-
-If traversal terminates, the dead record may be left
-indefinitely.
-
-3.12.1 Proposed Solution
-
-Remove reliability guarantees; see [traverse-Proposed-Solution].
-
-3.13 Fcntl Locking Adds Overhead
-
-Placing a fcntl lock means a system call, as does removing one.
-This is actually one reason why transactions can be faster
-(everything is locked once at transaction start). In the
-uncontended case, this overhead can theoretically be eliminated.
-
-3.13.1 Proposed Solution
-
-None.
-
-We tried this before with spinlock support, in the early days of
-TDB, and it didn't make much difference except in manufactured
-benchmarks.
-
-We could use spinlocks (with futex kernel support under Linux),
-but it means that we lose automatic cleanup when a process dies
-with a lock. There is a method of auto-cleanup under Linux, but
-it's not supported by other operating systems. We could
-reintroduce a clear-if-first-style lock and sweep for dead
-futexes on open, but that wouldn't help the normal case of one
-concurrent opener dying. Increasingly elaborate repair schemes
-could be considered, but they require an ABI change (everyone
-must use them) anyway, so there's no need to do this at the same
-time as everything else.
diff --git a/lib/tdb2/doc/design.lyx b/lib/tdb2/doc/design.lyx
deleted file mode 100644
index 0a1d6a14bc..0000000000
--- a/lib/tdb2/doc/design.lyx
+++ /dev/null
@@ -1,2689 +0,0 @@
-#LyX 1.6.7 created this file. For more info see http://www.lyx.org/
-\lyxformat 345
-\begin_document
-\begin_header
-\textclass article
-\use_default_options true
-\language english
-\inputencoding auto
-\font_roman default
-\font_sans default
-\font_typewriter default
-\font_default_family default
-\font_sc false
-\font_osf false
-\font_sf_scale 100
-\font_tt_scale 100
-
-\graphics default
-\paperfontsize default
-\use_hyperref false
-\papersize default
-\use_geometry false
-\use_amsmath 1
-\use_esint 1
-\cite_engine basic
-\use_bibtopic false
-\paperorientation portrait
-\secnumdepth 3
-\tocdepth 3
-\paragraph_separation indent
-\defskip medskip
-\quotes_language english
-\papercolumns 1
-\papersides 1
-\paperpagestyle default
-\tracking_changes true
-\output_changes true
-\author ""
-\author ""
-\end_header
-
-\begin_body
-
-\begin_layout Title
-TDB2: Redesigning The Trivial DataBase
-\end_layout
-
-\begin_layout Author
-Rusty Russell, IBM Corporation
-\end_layout
-
-\begin_layout Date
-17-March-2011
-\end_layout
-
-\begin_layout Abstract
-The Trivial DataBase on-disk format is 32 bits; with usage cases heading
- towards the 4G limit, that must change.
- This required breakage provides an opportunity to revisit TDB's other design
- decisions and reassess them.
-\end_layout
-
-\begin_layout Section
-Introduction
-\end_layout
-
-\begin_layout Standard
-The Trivial DataBase was originally written by Andrew Tridgell as a simple
- key/data pair storage system with the same API as dbm, but allowing multiple
- readers and writers while being small enough (< 1000 lines of C) to include
- in SAMBA.
- The simple design created in 1999 has proven surprisingly robust and performant
-, used in Samba versions 3 and 4 as well as numerous other projects.
- Its useful life was greatly increased by the (backwards-compatible!) addition
- of transaction support in 2005.
-\end_layout
-
-\begin_layout Standard
-The wider variety and greater demands of TDB-using code have led to some
- organic growth of the API, as well as some compromises on the implementation.
- None of these, by themselves, are seen as show-stoppers, but the cumulative
- effect is a loss of elegance over the initial, simple TDB implementation.
- Here is a table of the approximate number of lines of implementation code
- and number of API functions at the end of each year:
-\end_layout
-
-\begin_layout Standard
-\begin_inset Tabular
-<lyxtabular version="3" rows="12" columns="3">
-<features>
-<column alignment="center" valignment="top" width="0">
-<column alignment="center" valignment="top" width="0">
-<column alignment="center" valignment="top" width="0">
-<row>
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-Year End
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-API Functions
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-Lines of C Code Implementation
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-1999
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-13
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-1195
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2000
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-24
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-1725
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2001
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-32
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2228
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2002
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-35
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2481
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2003
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-35
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2552
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2004
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-40
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2584
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2005
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-38
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2647
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2006
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-52
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-3754
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2007
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-66
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-4398
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2008
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-71
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-4768
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2009
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-73
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-5715
-\end_layout
-
-\end_inset
-</cell>
-</row>
-</lyxtabular>
-
-\end_inset
-
-
-\end_layout
-
-\begin_layout Standard
-This review is an attempt to catalog and address all the known issues with
- TDB and create solutions which address the problems without significantly
- increasing complexity; all involved are far too aware of the dangers of
- second system syndrome in rewriting a successful project like this.
-\end_layout
-
-\begin_layout Section
-API Issues
-\end_layout
-
-\begin_layout Subsection
-tdb_open_ex Is Not Expandable
-\end_layout
-
-\begin_layout Standard
-The tdb_open() call was expanded to tdb_open_ex(), which added an optional
- hashing function and an optional logging function argument.
- Additional arguments to open would require the introduction of a tdb_open_ex2
- call etc.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\begin_inset CommandInset label
-LatexCommand label
-name "attributes"
-
-\end_inset
-
-
-\end_layout
-
-\begin_layout Standard
-tdb_open() will take a linked-list of attributes:
-\end_layout
-
-\begin_layout LyX-Code
-enum tdb_attribute {
-\end_layout
-
-\begin_layout LyX-Code
-    TDB_ATTRIBUTE_LOG = 0,
-\end_layout
-
-\begin_layout LyX-Code
-    TDB_ATTRIBUTE_HASH = 1
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_attribute_base {
-\end_layout
-
-\begin_layout LyX-Code
-    enum tdb_attribute attr;
-\end_layout
-
-\begin_layout LyX-Code
-    union tdb_attribute *next;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_attribute_log {
-\end_layout
-
-\begin_layout LyX-Code
-    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
-\end_layout
-
-\begin_layout LyX-Code
-    tdb_log_func log_fn;
-\end_layout
-
-\begin_layout LyX-Code
-    void *log_private;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_attribute_hash {
-\end_layout
-
-\begin_layout LyX-Code
-    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
-\end_layout
-
-\begin_layout LyX-Code
-    tdb_hash_func hash_fn;
-\end_layout
-
-\begin_layout LyX-Code
-    void *hash_private;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout LyX-Code
-union tdb_attribute {
-\end_layout
-
-\begin_layout LyX-Code
-    struct tdb_attribute_base base;
-\end_layout
-
-\begin_layout LyX-Code
-    struct tdb_attribute_log log;
-\end_layout
-
-\begin_layout LyX-Code
-    struct tdb_attribute_hash hash;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout Standard
-This allows future attributes to be added, even if this expands the size
- of the union.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-tdb_traverse Makes Impossible Guarantees
-\end_layout
-
-\begin_layout Standard
-tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, and it
- was thought that it was important to guarantee that all records which exist
- at the start and end of the traversal would be included, and no record
- would be included twice.
-\end_layout
-
-\begin_layout Standard
-This adds complexity (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "Reliable-Traversal-Adds"
-
-\end_inset
-
-) and does not work anyway for records which are altered (in particular,
- those which are expanded may be effectively deleted and re-added behind
- the traversal).
-\end_layout
-
-\begin_layout Subsubsection
-\begin_inset CommandInset label
-LatexCommand label
-name "traverse-Proposed-Solution"
-
-\end_inset
-
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Abandon the guarantee.
- You will see every record if no changes occur during your traversal, otherwise
- you will see some subset.
- You can prevent changes by using a transaction or the locking API.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
- Delete-during-traverse will still delete every record, too (assuming no
- other changes).
-\end_layout
-
-\begin_layout Subsection
-Nesting of Transactions Is Fraught
-\end_layout
-
-\begin_layout Standard
-TDB has alternated between allowing nested transactions and not allowing
- them.
- Various paths in the Samba codebase assume that transactions will nest,
- and in a sense they can: the operation is only committed to disk when the
- outer transaction is committed.
- There are two problems, however:
-\end_layout
-
-\begin_layout Enumerate
-Canceling the inner transaction will cause the outer transaction commit
- to fail, and will not undo any operations since the inner transaction began.
- This problem is soluble with some additional internal code.
-\end_layout
-
-\begin_layout Enumerate
-An inner transaction commit can be cancelled by the outer transaction.
- This is desirable in the way which Samba's database initialization code
- uses transactions, but could be a surprise to any users expecting a successful
- transaction commit to expose changes to others.
-\end_layout
-
-\begin_layout Standard
-The current solution is to specify the behavior at tdb_open(), with the
- default currently that nested transactions are allowed.
- This flag can also be changed at runtime.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Given the usage patterns, it seems that the
-\begin_inset Quotes eld
-\end_inset
-
-least-surprise
-\begin_inset Quotes erd
-\end_inset
-
- behavior of disallowing nested transactions should become the default.
- Additionally, it seems the outer transaction is the only code which knows
- whether inner transactions should be allowed, so a flag to indicate this
- could be added to tdb_transaction_start.
- However, this behavior can be simulated with a wrapper which uses tdb_add_flags
-() and tdb_remove_flags(), so the API should not be expanded for this relatively
--obscure case.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete; the nesting flag has been removed.
-\end_layout
-
-\begin_layout Subsection
-Incorrect Hash Function is Not Detected
-\end_layout
-
-\begin_layout Standard
-tdb_open_ex() allows the calling code to specify a different hash function
- to use, but does not check that all other processes accessing this tdb
- are using the same hash function.
- The result is that records are missing from tdb_fetch().
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The header should contain an example hash result (eg.
- the hash of 0xdeadbeef), and tdb_open_ex() should check that the given
- hash function produces the same answer, or fail the tdb_open call.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-tdb_set_max_dead/TDB_VOLATILE Expose Implementation
-\end_layout
-
-\begin_layout Standard
-In response to scalability issues with the free list (
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "TDB-Freelist-Is"
-
-\end_inset
-
-) two API workarounds have been incorporated in TDB: tdb_set_max_dead()
- and the TDB_VOLATILE flag to tdb_open.
- The latter actually calls the former with an argument of
-\begin_inset Quotes eld
-\end_inset
-
-5
-\begin_inset Quotes erd
-\end_inset
-
-.
-\end_layout
-
-\begin_layout Standard
-This code allows deleted records to accumulate without putting them in the
- free list.
- On delete we iterate through each chain and free them in a batch if there
- are more than max_dead entries.
- These are never otherwise recycled except as a side-effect of a tdb_repack.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-With the scalability problems of the freelist solved, this API can be removed.
- The TDB_VOLATILE flag may still be useful as a hint that store and delete
- of records will be at least as common as fetch in order to allow some internal
- tuning, but initially will become a no-op.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
- Unknown flags cause tdb_open() to fail as well, so they can be detected
- at runtime.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "TDB-Files-Cannot"
-
-\end_inset
-
-TDB Files Cannot Be Opened Multiple Times In The Same Process
-\end_layout
-
-\begin_layout Standard
-No process can open the same TDB twice; we check and disallow it.
- This is an unfortunate side-effect of fcntl locks, which operate on a per-file
- rather than per-file-descriptor basis, and do not nest.
- Thus, closing any file descriptor on a file clears all the locks obtained
- by this process, even if they were placed using a different file descriptor!
-\end_layout
-
-\begin_layout Standard
-Note that even if this were solved, deadlock could occur if operations were
- nested: this is a more manageable programming error in most cases.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-We could lobby POSIX to fix the perverse rules, or at least lobby Linux
- to violate them so that the most common implementation does not have this
- restriction.
- This would be a generally good idea for other fcntl lock users.
-\end_layout
-
-\begin_layout Standard
-Samba uses a wrapper which hands out the same tdb_context to multiple callers
- if this happens, and does simple reference counting.
- We should do this inside the tdb library, which already emulates lock nesting
- internally; it would need to recognize when deadlock occurs within a single
- process.
- This would create a new failure mode for tdb operations (while we currently
- handle locking failures, they are impossible in normal use and a process
- encountering them can do little but give up).
-\end_layout
-
-\begin_layout Standard
-I do not see benefit in an additional tdb_open flag to indicate whether
- re-opening is allowed, though there may be some benefit to adding a
- call to detect when a tdb_context is shared, to allow others to create such
- an API.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-TDB API Is Not POSIX Thread-safe
-\end_layout
-
-\begin_layout Standard
-The TDB API uses an error code which can be queried after an operation to
- determine what went wrong.
- This programming model does not work with threads, unless specific additional
- guarantees are given by the implementation.
- In addition, even otherwise-independent threads cannot open the same TDB
- (as in
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "TDB-Files-Cannot"
-
-\end_inset
-
-).
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Rearchitecting the API to include a tdb_errcode pointer would be a great
- deal of churn, but fortunately most functions return 0 on success and -1
- on error: we can change these to return 0 on success and a negative error
- code on error, and the API remains similar to previous.
- The tdb_fetch, tdb_firstkey and tdb_nextkey functions need to take a TDB_DATA
- pointer and return an error code.
- It is also simpler to have tdb_nextkey replace its key argument in place,
- freeing up any old .dptr.
-\end_layout
-
-\begin_layout Standard
-Internal locking is required to make sure that fcntl locks do not overlap
- between threads, and also that the global list of tdbs is maintained.
-\end_layout
-
-\begin_layout Standard
-The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
- version of the library, and otherwise no overhead will exist.
- Alternatively, a hooking mechanism similar to that proposed for
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "Proposed-Solution-locking-hook"
-
-\end_inset
-
- could be used to enable pthread locking at runtime.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Incomplete; API has been changed but thread safety has not been implemented.
-\end_layout
-
-\begin_layout Subsection
-*_nonblock Functions And *_mark Functions Expose Implementation
-\end_layout
-
-\begin_layout Standard
-CTDB
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-Clustered TDB, see http://ctdb.samba.org
-\end_layout
-
-\end_inset
-
- wishes to operate on TDB in a non-blocking manner.
- This is currently done as follows:
-\end_layout
-
-\begin_layout Enumerate
-Call the _nonblock variant of an API function (eg.
- tdb_lockall_nonblock).
- If this fails:
-\end_layout
-
-\begin_layout Enumerate
-Fork a child process, and wait for it to call the normal variant (eg.
- tdb_lockall).
-\end_layout
-
-\begin_layout Enumerate
-If the child succeeds, call the _mark variant to indicate we already have
- the locks (eg.
- tdb_lockall_mark).
-\end_layout
-
-\begin_layout Enumerate
-Upon completion, tell the child to release the locks (eg.
- tdb_unlockall).
-\end_layout
-
-\begin_layout Enumerate
-Indicate to tdb that it should consider the locks removed (eg.
- tdb_unlockall_mark).
-\end_layout
-
-\begin_layout Standard
-There are several issues with this approach.
- Firstly, adding two new variants of each function clutters the API for
- an obscure use, and so not all functions have three variants.
- Secondly, it assumes that all paths of the functions ask for the same locks,
- otherwise the parent process will have to get a lock which the child doesn't
- have under some circumstances.
- I don't believe this is currently the case, but it constrains the implementatio
-n.
-
-\end_layout
-
-\begin_layout Subsubsection
-\begin_inset CommandInset label
-LatexCommand label
-name "Proposed-Solution-locking-hook"
-
-\end_inset
-
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Implement a hook for locking methods, so that the caller can control the
- calls to create and remove fcntl locks.
- In this scenario, ctdbd would operate as follows:
-\end_layout
-
-\begin_layout Enumerate
-Call the normal API function, eg tdb_lockall().
-\end_layout
-
-\begin_layout Enumerate
-When the lock callback comes in, check if the child has the lock.
- Initially, this is always false.
- If so, return 0.
- Otherwise, try to obtain it in non-blocking mode.
- If that fails, return EWOULDBLOCK.
-\end_layout
-
-\begin_layout Enumerate
-Release locks in the unlock callback as normal.
-\end_layout
-
-\begin_layout Enumerate
-If tdb_lockall() fails, see if we recorded a lock failure; if so, call the
- child to repeat the operation.
-\end_layout
-
-\begin_layout Enumerate
-The child records what locks it obtains, and returns that information to
- the parent.
-\end_layout
-
-\begin_layout Enumerate
-When the child has succeeded, goto 1.
-\end_layout
-
-\begin_layout Standard
-This is flexible enough to handle any potential locking scenario, even when
- lock requirements change.
- It can be optimized so that the parent does not release locks, just tells
- the child which locks it doesn't need to obtain.
-\end_layout
-
-\begin_layout Standard
-It also keeps the complexity out of the API, and in ctdbd where it is needed.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Incomplete.
-\end_layout
-
-\begin_layout Subsection
-tdb_chainlock Functions Expose Implementation
-\end_layout
-
-\begin_layout Standard
-tdb_chainlock locks some number of records, including the record indicated
- by the given key.
- This gave atomicity guarantees; no-one can start a transaction, alter,
- read or delete that key while the lock is held.
-\end_layout
-
-\begin_layout Standard
-It also makes the same guarantee for any other key in the chain, which is
- an internal implementation detail and potentially a cause for deadlock.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-None.
- It would be nice to have an explicit single entry lock which affected no
- other keys.
- Unfortunately, this won't work for an entry which doesn't exist.
- Thus while chainlock may be implemented more efficiently for the existing
- case, it will still have overlap issues with the non-existing case.
- So it is best to keep the current (lack of) guarantee about which records
- will be affected to avoid constraining our implementation.
-\end_layout
-
-\begin_layout Subsection
-Signal Handling is Not Race-Free
-\end_layout
-
-\begin_layout Standard
-The tdb_setalarm_sigptr() call allows the caller's signal handler to indicate
- that the tdb locking code should return with a failure, rather than trying
- again when a signal is received (and errno == EAGAIN).
- This is usually used to implement timeouts.
-\end_layout
-
-\begin_layout Standard
-Unfortunately, this does not work in the case where the signal is received
- before the tdb code enters the fcntl() call to place the lock: the code
- will sleep within the fcntl() code, unaware that the signal wants it to
- exit.
- In the case of long timeouts, this does not happen in practice.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The locking hooks proposed in
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "Proposed-Solution-locking-hook"
-
-\end_inset
-
- would allow the user to decide on whether to fail the lock acquisition
- on a signal.
- This allows the caller to choose their own compromise: they could narrow
- the race by checking immediately before the fcntl call.
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-It may be possible to make this race-free in some implementations by having
- the signal handler alter the struct flock to make it invalid.
- This will cause the fcntl() lock call to fail with EINVAL if the signal
- occurs before the kernel is entered, otherwise EAGAIN.
-\end_layout
-
-\end_inset
-
-
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Incomplete.
-\end_layout
-
-\begin_layout Subsection
-The API Uses Gratuitous Typedefs, Capitals
-\end_layout
-
-\begin_layout Standard
-typedefs are useful for providing source compatibility when types can differ
- across implementations, or arguably in the case of function pointer definitions
- which are hard for humans to parse.
- Otherwise it is simply obfuscation and pollutes the namespace.
-\end_layout
-
-\begin_layout Standard
-Capitalization is usually reserved for compile-time constants and macros.
-\end_layout
-
-\begin_layout Description
-TDB_CONTEXT There is no reason to use this over 'struct tdb_context'; the
- definition isn't visible to the API user anyway.
-\end_layout
-
-\begin_layout Description
-TDB_DATA There is no reason to use this over struct TDB_DATA; the struct
- needs to be understood by the API user.
-\end_layout
-
-\begin_layout Description
-struct
-\begin_inset space ~
-\end_inset
-
-TDB_DATA This would normally be called 'struct tdb_data'.
-\end_layout
-
-\begin_layout Description
-enum
-\begin_inset space ~
-\end_inset
-
-TDB_ERROR Similarly, this would normally be enum tdb_error.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-None.
- Introducing lower case variants would please pedants like myself, but if
- it were done the existing ones should be kept.
- There is little point forcing a purely cosmetic change upon tdb users.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "tdb_log_func-Doesnt-Take"
-
-\end_inset
-
-tdb_log_func Doesn't Take The Private Pointer
-\end_layout
-
-\begin_layout Standard
-For API compatibility reasons, the logging function needs to call tdb_get_loggin
-g_private() to retrieve the pointer registered by the tdb_open_ex for logging.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-It should simply take an extra argument, since we are prepared to break
- the API/ABI.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-Various Callback Functions Are Not Typesafe
-\end_layout
-
-\begin_layout Standard
-The callback functions in tdb_set_logging_function (after
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "tdb_log_func-Doesnt-Take"
-
-\end_inset
-
- is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read and tdb_check
- all take void * and must internally convert it to the argument type they
- were expecting.
-\end_layout
-
-\begin_layout Standard
-If this type changes, the compiler will not produce warnings on the callers,
- since it only sees void *.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-With careful use of macros, we can create callback functions which give
- a warning when used on gcc and the types of the callback and its private
- argument differ.
- Unsupported compilers will not give a warning, which is no worse than now.
- In addition, the callbacks become clearer, as they need not use void *
- for their parameter.
-\end_layout
-
-\begin_layout Standard
-See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
-\end_layout
-
-\begin_layout Standard
-The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB file should
- be cleared if the caller discovers it is the only process with the TDB
- open.
- However, if any caller does not specify TDB_CLEAR_IF_FIRST it will not
- be detected, so will have the TDB erased underneath them (usually resulting
- in a crash).
-\end_layout
-
-\begin_layout Standard
-There is a similar issue on fork(); if the parent exits (or otherwise closes
- the tdb) before the child calls tdb_reopen_all() to establish the lock
- used to indicate the TDB is opened by someone, a TDB_CLEAR_IF_FIRST opener
- at that moment will believe it alone has opened the TDB and will erase
- it.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Remove TDB_CLEAR_IF_FIRST.
- Other workarounds are possible, but see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
-
-\end_inset
-
-.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-Extending The Header Is Difficult
-\end_layout
-
-\begin_layout Standard
-We have reserved (zeroed) words in the TDB header, which can be used for
- future features.
- If the future features are compulsory, the version number must be updated
- to prevent old code from accessing the database.
- But if the future feature is optional, we have no way of telling if older
- code is accessing the database or not.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The header should contain a
-\begin_inset Quotes eld
-\end_inset
-
-format variant
-\begin_inset Quotes erd
-\end_inset
-
- value (64-bit).
- This is divided into two 32-bit parts:
-\end_layout
-
-\begin_layout Enumerate
-The lower part reflects the format variant understood by code accessing
- the database.
-\end_layout
-
-\begin_layout Enumerate
-The upper part reflects the format variant you must understand to write
- to the database (otherwise you can only open for reading).
-\end_layout
-
-\begin_layout Standard
-The latter field can only be written at creation time, the former should
- be written under the OPEN_LOCK when opening the database for writing, if
- the variant of the code is lower than the current lowest variant.
-\end_layout
-
-\begin_layout Standard
-This should allow backwards-compatible features to be added, and detection
- if older code (which doesn't understand the feature) writes to the database.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-Record Headers Are Not Expandable
-\end_layout
-
-\begin_layout Standard
-If we later want to add (say) checksums on keys and data, it would require
- another format change, which we'd like to avoid.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-We often have extra padding at the tail of a record.
- If we ensure that the first byte (if any) of this padding is zero, we will
- have a way for future changes to detect code which doesn't understand a
- new format: the new code would write (say) a 1 at the tail, and thus if
- there is no tail or the first byte is 0, we would know the extension is
- not present on that record.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-TDB Does Not Use Talloc
-\end_layout
-
-\begin_layout Standard
-Many users of TDB (particularly Samba) use the talloc allocator, and thus
- have to wrap TDB in a talloc context to use it conveniently.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The allocation within TDB is not complicated enough to justify the use of
- talloc, and I am reluctant to force another (excellent) library on TDB
- users.
- Nonetheless a compromise is possible.
- An attribute (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "attributes"
-
-\end_inset
-
-) can be added later to tdb_open() to provide an alternate allocation mechanism,
- specifically for talloc but usable by any other allocator (which would
- ignore the
-\begin_inset Quotes eld
-\end_inset
-
-context
-\begin_inset Quotes erd
-\end_inset
-
- argument).
-\end_layout
-
-\begin_layout Standard
-This would form a talloc hierarchy as expected, but the caller would still
- have to attach a destructor to the tdb context returned from tdb_open to
- close it.
- All TDB_DATA fields would be children of the tdb_context, and the caller
- would still have to manage them (using talloc_free() or talloc_steal()).
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Deferred.
-\end_layout
-
-\begin_layout Section
-Performance And Scalability Issues
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "TDB_CLEAR_IF_FIRST-Imposes-Performance"
-
-\end_inset
-
-TDB_CLEAR_IF_FIRST Imposes Performance Penalty
-\end_layout
-
-\begin_layout Standard
-When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is placed at offset
- 4 (aka.
- the ACTIVE_LOCK).
- While these locks never conflict in normal tdb usage, they do add substantial
- overhead for most fcntl lock implementations when the kernel scans to detect
- if a lock conflict exists.
- This is often a single linked list, making the time to acquire and release
- a fcntl lock O(N) where N is the number of processes with the TDB open,
- not the number actually doing work.
-\end_layout
-
-\begin_layout Standard
-In a Samba server it is common to have huge numbers of clients sitting idle,
- and thus they have weaned themselves off the TDB_CLEAR_IF_FIRST flag.
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-There is a flag to tdb_reopen_all() which is used for this optimization:
- if the parent process will outlive the child, the child does not need the
- ACTIVE_LOCK.
- This is a workaround for this very performance issue.
-\end_layout
-
-\end_inset
-
-
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Remove the flag.
- It was a neat idea, but even trivial servers tend to know when they are
- initializing for the first time and can simply unlink the old tdb at that
- point.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-TDB Files Have a 4G Limit
-\end_layout
-
-\begin_layout Standard
-This seems to be becoming an issue (so much for
-\begin_inset Quotes eld
-\end_inset
-
-trivial
-\begin_inset Quotes erd
-\end_inset
-
-!), particularly for ldb.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-A new, incompatible TDB format which uses 64 bit offsets internally rather
- than 32 bit as now.
- For simplicity of endian conversion (which TDB does on the fly if required),
- all values will be 64 bit on disk.
- In practice, some upper bits may be used for other purposes, but at least
- 56 bits will be available for file offsets.
-\end_layout
-
-\begin_layout Standard
-tdb_open() will automatically detect the old version, and even create them
- if TDB_VERSION6 is specified to tdb_open.
-\end_layout
-
-\begin_layout Standard
-32 bit processes will still be able to access TDBs larger than 4G (assuming
- that their off_t allows them to seek to 64 bits), they will gracefully
- fall back as they fail to mmap.
- This can happen already with large TDBs.
-\end_layout
-
-\begin_layout Standard
-Old versions of tdb will fail to open the new TDB files (since 28 August
- 2009, commit 398d0c29290: prior to that any unrecognized file format would
- be erased and initialized as a fresh tdb!)
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-TDB Records Have a 4G Limit
-\end_layout
-
-\begin_layout Standard
-This has not been a reported problem, and the API uses size_t which can
- be 64 bit on 64 bit platforms.
- However, other limits may have made such an issue moot.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Record sizes will be 64 bit, with an error returned on 32 bit platforms
- which try to access such records (the current implementation would return
- TDB_ERR_OOM in a similar case).
- It seems unlikely that 32 bit keys will be a limitation, so the implementation
- may not support this (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:Records-Incur-A"
-
-\end_inset
-
-).
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-Hash Size Is Determined At TDB Creation Time
-\end_layout
-
-\begin_layout Standard
-TDB contains a number of hash chains in the header; the number is specified
- at creation time, and defaults to 131.
- This is such a bottleneck on large databases (as each hash chain gets quite
- long), that LDB uses 10,000 for this hash.
- In general it is impossible to know what the 'right' answer is at database
- creation time.
-\end_layout
-
-\begin_layout Subsubsection
-\begin_inset CommandInset label
-LatexCommand label
-name "sub:Hash-Size-Solution"
-
-\end_inset
-
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-After comprehensive performance testing on various scalable hash variants
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoying
- because I was previously convinced that an expanding tree of hashes would
- be very close to optimal.
-\end_layout
-
-\end_inset
-
-, it became clear that it is hard to beat a straight linear hash table which
- doubles in size when it reaches saturation.
- Unfortunately, altering the hash table introduces serious locking complications
-: the entire hash table needs to be locked to enlarge the hash table, and
- others might be holding locks.
- Particularly insidious are insertions done under tdb_chainlock.
-\end_layout
-
-\begin_layout Standard
-Thus an expanding layered hash will be used: an array of hash groups, with
- each hash group exploding into pointers to lower hash groups once it fills,
- turning into a hash tree.
- This has implications for locking: we must lock the entire group in case
- we need to expand it, yet we don't know how deep the tree is at that point.
-\end_layout
-
-\begin_layout Standard
-Note that bits from the hash table entries should be stolen to hold more
- hash bits to reduce the penalty of collisions.
- We can use the otherwise-unused lower 3 bits.
- If we limit the size of the database to 64 exabytes, we can use the top
- 8 bits of the hash entry as well.
- These 11 bits would reduce false positives down to 1 in 2000 which is more
- than we need: we can use one of the bits to indicate that the extra hash
- bits are valid.
- This means we can choose not to re-hash all entries when we expand a hash
- group; simply use the next bits we need and mark them invalid.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "TDB-Freelist-Is"
-
-\end_inset
-
-TDB Freelist Is Highly Contended
-\end_layout
-
-\begin_layout Standard
-TDB uses a single linked list for the free list.
- Allocation occurs as follows, using heuristics which have evolved over
- time:
-\end_layout
-
-\begin_layout Enumerate
-Get the free list lock for this whole operation.
-\end_layout
-
-\begin_layout Enumerate
-Multiply length by 1.25, so we always over-allocate by 25%.
-\end_layout
-
-\begin_layout Enumerate
-Set the slack multiplier to 1.
-\end_layout
-
-\begin_layout Enumerate
-Examine the current freelist entry: if it is > length but < the current
- best case, remember it as the best case.
-\end_layout
-
-\begin_layout Enumerate
-Multiply the slack multiplier by 1.05.
-\end_layout
-
-\begin_layout Enumerate
-If our best fit so far is less than length * slack multiplier, return it.
- The slack will be turned into a new free record if it's large enough.
-\end_layout
-
-\begin_layout Enumerate
-Otherwise, go onto the next freelist entry.
-\end_layout
-
-\begin_layout Standard
-Deleting a record occurs as follows:
-\end_layout
-
-\begin_layout Enumerate
-Lock the hash chain for this whole operation.
-\end_layout
-
-\begin_layout Enumerate
-Walk the chain to find the record, keeping the prev pointer offset.
-\end_layout
-
-\begin_layout Enumerate
-If max_dead is non-zero:
-\end_layout
-
-\begin_deeper
-\begin_layout Enumerate
-Walk the hash chain again and count the dead records.
-\end_layout
-
-\begin_layout Enumerate
-If it's more than max_dead, bulk free all the dead ones (similar to steps
- 4 and below, but the lock is only obtained once).
-\end_layout
-
-\begin_layout Enumerate
-Simply mark this record as dead and return.
-
-\end_layout
-
-\end_deeper
-\begin_layout Enumerate
-Get the free list lock for the remainder of this operation.
-\end_layout
-
-\begin_layout Enumerate
-\begin_inset CommandInset label
-LatexCommand label
-name "right-merging"
-
-\end_inset
-
-Examine the following block to see if it is free; if so, enlarge the current
- block and remove that block from the free list.
- This was disabled, as removal from the free list was O(entries-in-free-list).
-\end_layout
-
-\begin_layout Enumerate
-Examine the preceding block to see if it is free: for this reason, each
- block has a 32-bit tailer which indicates its length.
- If it is free, expand it to cover our new block and return.
-\end_layout
-
-\begin_layout Enumerate
-Otherwise, prepend ourselves to the free list.
-\end_layout
-
-\begin_layout Standard
-Disabling right-merging (step
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "right-merging"
-
-\end_inset
-
-) causes fragmentation; the other heuristics proved insufficient to address
- this, so the final answer to this was that when we expand the TDB file
- inside a transaction commit, we repack the entire tdb.
-\end_layout
-
-\begin_layout Standard
-The single list lock limits our allocation rate; due to the other issues
- this is not currently seen as a bottleneck.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The first step is to remove all the current heuristics, as they obviously
- interact, then examine them once the lock contention is addressed.
-\end_layout
-
-\begin_layout Standard
-The free list must be split to reduce contention.
- Assuming perfect free merging, we can at most have 1 free list entry for
- each entry.
- This implies that the number of free lists is related to the size of the
- hash table, but as it is rare to walk a large number of free list entries
- we can use far fewer, say 1/32 of the number of hash buckets.
-\end_layout
-
-\begin_layout Standard
-It seems tempting to try to reuse the hash implementation which we use for
- records here, but we have two ways of searching for free entries: for allocatio
-n we search by size (and possibly zone) which produces too many clashes
- for our hash table to handle well, and for coalescing we search by address.
- Thus an array of doubly-linked free lists seems preferable.
-\end_layout
-
-\begin_layout Standard
-There are various benefits in using per-size free lists (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:TDB-Becomes-Fragmented"
-
-\end_inset
-
-) but it's not clear this would reduce contention in the common case where
- all processes are allocating/freeing the same size.
- Thus we almost certainly need to divide in other ways: the most obvious
- is to divide the file into zones, and use a free list (or table of free
- lists) for each.
- This approximates address ordering.
-\end_layout
-
-\begin_layout Standard
-Unfortunately it is difficult to know what heuristics should be used to
- determine zone sizes, and our transaction code relies on being able to
- create a
-\begin_inset Quotes eld
-\end_inset
-
-recovery area
-\begin_inset Quotes erd
-\end_inset
-
- by simply appending to the file (difficult if it would need to create a
- new zone header).
- Thus we use a linked-list of free tables; currently we only ever create
- one, but if there is more than one we choose one at random to use.
- In future we may use heuristics to add new free tables on contention.
- We only expand the file when all free tables are exhausted.
-\end_layout
-
-\begin_layout Standard
-The basic algorithm is as follows.
- Freeing is simple:
-\end_layout
-
-\begin_layout Enumerate
-Identify the correct free list.
-\end_layout
-
-\begin_layout Enumerate
-Lock the corresponding list.
-\end_layout
-
-\begin_layout Enumerate
-Re-check the list (we didn't have a lock, sizes could have changed): relock
- if necessary.
-\end_layout
-
-\begin_layout Enumerate
-Place the freed entry in the list.
-\end_layout
-
-\begin_layout Standard
-Allocation is a little more complicated, as we perform delayed coalescing
- at this point:
-\end_layout
-
-\begin_layout Enumerate
-Pick a free table; usually the previous one.
-\end_layout
-
-\begin_layout Enumerate
-Lock the corresponding list.
-\end_layout
-
-\begin_layout Enumerate
-If the top entry is large enough, remove it from the list and return it.
-\end_layout
-
-\begin_layout Enumerate
-Otherwise, coalesce entries in the list. If there was no entry large enough,
- unlock the list and try the next largest list.
-\end_layout
-
-\begin_layout Enumerate
-If no list has an entry which meets our needs, try the next free table.
-\end_layout
-
-\begin_layout Enumerate
-If no zone satisfies, expand the file.
-\end_layout
-
-\begin_layout Standard
-This optimizes rapid insert/delete of free list entries by not coalescing
- them all the time.
- First-fit address ordering seems to be fairly good for keeping
- fragmentation low (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:TDB-Becomes-Fragmented"
-
-\end_inset
-
-).
- Note that address ordering does not need a tailer to coalesce, though if
- we needed one we could have one cheaply: see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:Records-Incur-A"
-
-\end_inset
-
-.
-
-\end_layout
-
-\begin_layout Standard
-Each free entry has the free table number in the header: less than 255.
- It also contains a doubly-linked list for easy deletion.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "sub:TDB-Becomes-Fragmented"
-
-\end_inset
-
-TDB Becomes Fragmented
-\end_layout
-
-\begin_layout Standard
-Much of this is a result of allocation strategy
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 ftp://ftp.cs.ute
-xas.edu/pub/garbage/malloc/ismm98.ps
-\end_layout
-
-\end_inset
-
- and deliberate hobbling of coalescing; internal fragmentation (aka overallocati
-on) is deliberately set at 25%, and external fragmentation is only cured
- by the decision to repack the entire db when a transaction commit needs
- to enlarge the file.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The 25% overhead on allocation works in practice for ldb because indexes
- tend to expand by one record at a time.
- This internal fragmentation can be resolved by having an
-\begin_inset Quotes eld
-\end_inset
-
-expanded
-\begin_inset Quotes erd
-\end_inset
-
- bit in the header to note entries that have previously expanded, and allocating
- more space for them.
-\end_layout
-
-\begin_layout Standard
-There is a spectrum of possible solutions for external fragmentation:
- one is to use a fragmentation-avoiding allocation strategy such as best-fit
- address-order allocator.
- The other end of the spectrum would be to use a bump allocator (very fast
- and simple) and simply repack the file when we reach the end.
-\end_layout
-
-\begin_layout Standard
-There are three problems with efficient fragmentation-avoiding allocators:
- they are non-trivial, they tend to use a single free list for each size,
- and there's no evidence that tdb allocation patterns will match those recorded
- for general allocators (though it seems likely).
-\end_layout
-
-\begin_layout Standard
-Thus we don't spend too much effort on external fragmentation; we will be
- no worse than the current code if we need to repack on occasion.
- More effort is spent on reducing freelist contention, and reducing overhead.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "sub:Records-Incur-A"
-
-\end_inset
-
-Records Incur A 28-Byte Overhead
-\end_layout
-
-\begin_layout Standard
-Each TDB record has a header as follows:
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_record {
-\end_layout
-
-\begin_layout LyX-Code
-        tdb_off_t next; /* offset of the next record in the list */
-\end_layout
-
-\begin_layout LyX-Code
-        tdb_len_t rec_len; /* total byte length of record */
-\end_layout
-
-\begin_layout LyX-Code
-        tdb_len_t key_len; /* byte length of key */
-\end_layout
-
-\begin_layout LyX-Code
-        tdb_len_t data_len; /* byte length of data */
-\end_layout
-
-\begin_layout LyX-Code
-        uint32_t full_hash; /* the full 32 bit hash of the key */
-\end_layout
-
-\begin_layout LyX-Code
-        uint32_t magic;   /* try to catch errors */
-\end_layout
-
-\begin_layout LyX-Code
-        /* the following union is implied:
-\end_layout
-
-\begin_layout LyX-Code
-                union {
-\end_layout
-
-\begin_layout LyX-Code
-                        char record[rec_len];
-\end_layout
-
-\begin_layout LyX-Code
-                        struct {
-\end_layout
-
-\begin_layout LyX-Code
-                                char key[key_len];
-\end_layout
-
-\begin_layout LyX-Code
-                                char data[data_len];
-\end_layout
-
-\begin_layout LyX-Code
-                        }
-\end_layout
-
-\begin_layout LyX-Code
-                        uint32_t totalsize; (tailer)
-\end_layout
-
-\begin_layout LyX-Code
-                }
-\end_layout
-
-\begin_layout LyX-Code
-        */
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout Standard
-Naively, this would double to a 56-byte overhead on a 64 bit implementation.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-We can use various techniques to reduce this for an allocated block:
-\end_layout
-
-\begin_layout Enumerate
-The 'next' pointer is not required, as we are using a flat hash table.
-\end_layout
-
-\begin_layout Enumerate
-'rec_len' can instead be expressed as an addition to key_len and data_len
- (it accounts for wasted or overallocated length in the record).
- Since the record length is always a multiple of 8, we can conveniently
- fit it in 32 bits (representing up to 35 bits).
-\end_layout
-
-\begin_layout Enumerate
-'key_len' and 'data_len' can be reduced.
- I'm unwilling to restrict 'data_len' to 32 bits, but instead we can combine
- the two into one 64-bit field and use a 5 bit value which indicates at
- what bit to divide the two.
- Keys are unlikely to scale as fast as data, so I'm assuming a maximum key
- size of 32 bits.
-\end_layout
-
-\begin_layout Enumerate
-'full_hash' is used to avoid a memcmp on the
-\begin_inset Quotes eld
-\end_inset
-
-miss
-\begin_inset Quotes erd
-\end_inset
-
- case, but this is diminishing returns after a handful of bits (at 10 bits,
- it reduces 99.9% of false memcmp).
- As an aside, as the lower bits are already incorporated in the hash table
- resolution, the upper bits should be used here.
- Note that it's not clear that these bits will be a win, given the extra
- bits in the hash table itself (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:Hash-Size-Solution"
-
-\end_inset
-
-).
-\end_layout
-
-\begin_layout Enumerate
-'magic' does not need to be enlarged: it currently reflects one of 5 values
- (used, free, dead, recovery, and unused_recovery).
- It is useful for quick sanity checking however, and should not be eliminated.
-\end_layout
-
-\begin_layout Enumerate
-'tailer' is only used to coalesce free blocks (so a block to the right can
- find the header to check if this block is free).
- This can be replaced by a single 'free' bit in the header of the following
- block (and the tailer only exists in free blocks).
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-This technique from Thomas Standish.
- Data Structure Techniques.
- Addison-Wesley, Reading, Massachusetts, 1980.
-\end_layout
-
-\end_inset
-
- The current proposed coalescing algorithm doesn't need this, however.
-\end_layout
-
-\begin_layout Standard
-This produces a 16 byte used header like this:
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_used_record {
-\end_layout
-
-\begin_layout LyX-Code
-        uint32_t used_magic : 16,
-\end_layout
-
-\begin_layout LyX-Code
-
-\end_layout
-
-\begin_layout LyX-Code
-                 key_data_divide: 5,
-\end_layout
-
-\begin_layout LyX-Code
-                 top_hash: 11;
-\end_layout
-
-\begin_layout LyX-Code
-        uint32_t extra_octets;
-\end_layout
-
-\begin_layout LyX-Code
-        uint64_t key_and_data_len;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout Standard
-And a free record like this:
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_free_record {
-\end_layout
-
-\begin_layout LyX-Code
-        uint64_t free_magic: 8,
-\end_layout
-
-\begin_layout LyX-Code
-                   prev : 56;
-\end_layout
-
-\begin_layout LyX-Code
-
-\end_layout
-
-\begin_layout LyX-Code
-        uint64_t free_table: 8,
-\end_layout
-
-\begin_layout LyX-Code
-                 total_length : 56;
-\end_layout
-
-\begin_layout LyX-Code
-        uint64_t next;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout Standard
-Note that by limiting valid offsets to 56 bits, we can pack everything we
- need into 3 64-bit words, meaning our minimum record size is 8 bytes.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-Transaction Commit Requires 4 fdatasync
-\end_layout
-
-\begin_layout Standard
-The current transaction algorithm is:
-\end_layout
-
-\begin_layout Enumerate
-write_recovery_data();
-\end_layout
-
-\begin_layout Enumerate
-sync();
-\end_layout
-
-\begin_layout Enumerate
-write_recovery_header();
-\end_layout
-
-\begin_layout Enumerate
-sync();
-\end_layout
-
-\begin_layout Enumerate
-overwrite_with_new_data();
-\end_layout
-
-\begin_layout Enumerate
-sync();
-\end_layout
-
-\begin_layout Enumerate
-remove_recovery_header();
-\end_layout
-
-\begin_layout Enumerate
-sync();
-\end_layout
-
-\begin_layout Standard
-On current ext3, each sync flushes all data to disk, so the next 3 syncs
- are relatively expensive.
- But this could become a performance bottleneck on other filesystems such
- as ext4.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Neil Brown points out that this is overzealous, and only one sync is needed:
-\end_layout
-
-\begin_layout Enumerate
-Bundle the recovery data, a transaction counter and a strong checksum of
- the new data.
-\end_layout
-
-\begin_layout Enumerate
-Strong checksum that whole bundle.
-\end_layout
-
-\begin_layout Enumerate
-Store the bundle in the database.
-\end_layout
-
-\begin_layout Enumerate
-Overwrite the oldest of the two recovery pointers in the header (identified
- using the transaction counter) with the offset of this bundle.
-\end_layout
-
-\begin_layout Enumerate
-sync.
-\end_layout
-
-\begin_layout Enumerate
-Write the new data to the file.
-\end_layout
-
-\begin_layout Standard
-Checking for recovery means identifying the latest bundle with a valid checksum
- and using the new data checksum to ensure that it has been applied.
- This is more expensive than the current check, but need only be done at
- open.
- For running databases, a separate header field can be used to indicate
- a transaction in progress; we need only check for recovery if this is set.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Deferred.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "sub:TDB-Does-Not"
-
-\end_inset
-
-TDB Does Not Have Snapshot Support
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-None.
- At some point you say
-\begin_inset Quotes eld
-\end_inset
-
-use a real database
-\begin_inset Quotes erd
-\end_inset
-
- (but see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "replay-attribute"
-
-\end_inset
-
-).
-\end_layout
-
-\begin_layout Standard
-But as a thought experiment, if we implemented transactions to only overwrite
- free entries (this is tricky: there must not be a header in each entry
- which indicates whether it is free; instead, presence in metadata elsewhere must be used),
- and a pointer to the hash table, we could create an entirely new commit
- without destroying existing data.
- Then it would be easy to implement snapshots in a similar way.
-\end_layout
-
-\begin_layout Standard
-This would not allow arbitrary changes to the database, such as tdb_repack
- does, and would require more space (since we have to preserve the current
- and future entries at once).
- If we used hash trees rather than one big hash table, we might only have
- to rewrite some sections of the hash, too.
-\end_layout
-
-\begin_layout Standard
-We could then implement snapshots using a similar method, using multiple
- different hash tables/free tables.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Deferred.
-\end_layout
-
-\begin_layout Subsection
-Transactions Cannot Operate in Parallel
-\end_layout
-
-\begin_layout Standard
-This would be useless for ldb, as it hits the index records with just about
- every update.
- It would add significant complexity in resolving clashes, and require
- all transaction callers to write their code to loop in the case where the
- transactions spuriously failed.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-None (but see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "replay-attribute"
-
-\end_inset
-
-).
- We could solve a small part of the problem by providing read-only transactions.
- These would allow one write transaction to begin, but it could not commit
- until all r/o transactions are done.
- This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
- commit.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Deferred.
-\end_layout
-
-\begin_layout Subsection
-Default Hash Function Is Suboptimal
-\end_layout
-
-\begin_layout Standard
-The Knuth-inspired multiplicative hash used by tdb is fairly slow (especially
- if we expand it to 64 bits), and works best when the hash bucket size is
- a prime number (which also means a slow modulus).
- In addition, it is highly predictable which could potentially lead to a
- Denial of Service attack in some TDB uses.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The Jenkins lookup3 hash
-\begin_inset Foot
-status open
-
-\begin_layout Plain Layout
-http://burtleburtle.net/bob/c/lookup3.c
-\end_layout
-
-\end_inset
-
- is a fast and superbly-mixing hash.
- It's used by the Linux kernel and almost everything else.
- This has the particular properties that it takes an initial seed, and produces
- two 32 bit hash numbers, which we can combine into a 64-bit hash.
-\end_layout
-
-\begin_layout Standard
-The seed should be created at tdb-creation time from some random source,
- and placed in the header.
- This is far from foolproof, but adds a little bit of protection against
- hash bombing.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "Reliable-Traversal-Adds"
-
-\end_inset
-
-Reliable Traversal Adds Complexity
-\end_layout
-
-\begin_layout Standard
-We lock a record during traversal iteration, and try to grab that lock in
- the delete code.
- If that grab on delete fails, we simply mark it deleted and continue onwards;
- traversal checks for this condition and does the delete when it moves off
- the record.
-\end_layout
-
-\begin_layout Standard
-If traversal terminates, the dead record may be left indefinitely.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Remove reliability guarantees; see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "traverse-Proposed-Solution"
-
-\end_inset
-
-.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-Fcntl Locking Adds Overhead
-\end_layout
-
-\begin_layout Standard
-Placing a fcntl lock means a system call, as does removing one.
- This is actually one reason why transactions can be faster (everything
- is locked once at transaction start).
- In the uncontended case, this overhead can theoretically be eliminated.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-None.
-\end_layout
-
-\begin_layout Standard
-We tried this before with spinlock support, in the early days of TDB, and
- it didn't make much difference except in manufactured benchmarks.
-\end_layout
-
-\begin_layout Standard
-We could use spinlocks (with futex kernel support under Linux), but it means
- that we lose automatic cleanup when a process dies with a lock.
- There is a method of auto-cleanup under Linux, but it's not supported by
- other operating systems.
- We could reintroduce a clear-if-first-style lock and sweep for dead futexes
- on open, but that wouldn't help the normal case of one concurrent opener
- dying.
- Increasingly elaborate repair schemes could be considered, but they require
- an ABI change (everyone must use them) anyway, so there's no need to do
- this at the same time as everything else.
-\end_layout
-
-\begin_layout Subsection
-Some Transactions Don't Require Durability
-\end_layout
-
-\begin_layout Standard
-Volker points out that gencache uses a CLEAR_IF_FIRST tdb for normal (fast)
- usage, and occasionally empties the results into a transactional TDB.
- This kind of usage prioritizes performance over durability: as long as
- we are consistent, data can be lost.
-\end_layout
-
-\begin_layout Standard
-This would be more neatly implemented inside tdb: a
-\begin_inset Quotes eld
-\end_inset
-
-soft
-\begin_inset Quotes erd
-\end_inset
-
- transaction commit (ie.
- syncless) which meant that data may be reverted on a crash.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-None.
-\end_layout
-
-\begin_layout Standard
-Unfortunately any transaction scheme which overwrites old data requires
- a sync before that overwrite to avoid the possibility of corruption.
-\end_layout
-
-\begin_layout Standard
-It seems possible to use a scheme similar to that described in
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:TDB-Does-Not"
-
-\end_inset
-
-, where transactions are committed without overwriting existing data, and
- an array of top-level pointers were available in the header.
- If the transaction is
-\begin_inset Quotes eld
-\end_inset
-
-soft
-\begin_inset Quotes erd
-\end_inset
-
- then we would not need a sync at all: existing processes would pick up
- the new hash table and free list and work with that.
-\end_layout
-
-\begin_layout Standard
-At some later point, a sync would allow recovery of the old data into the
- free lists (perhaps when the array of top-level pointers filled).
- On crash, tdb_open() would examine the array of top levels, and apply the
- transactions until it encountered an invalid checksum.
-\end_layout
-
-\begin_layout Subsection
-Tracing Is Fragile, Replay Is External
-\end_layout
-
-\begin_layout Standard
-The current TDB has compile-time-enabled tracing code, but it often breaks
- as it is not enabled by default.
- In a similar way, the ctdb code has an external wrapper which does replay
- tracing so it can coordinate cluster-wide transactions.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\begin_inset CommandInset label
-LatexCommand label
-name "replay-attribute"
-
-\end_inset
-
-
-\end_layout
-
-\begin_layout Standard
-Tridge points out that an attribute can be later added to tdb_open (see
-
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "attributes"
-
-\end_inset
-
-) to provide replay/trace hooks, which could become the basis for this and
- future parallel transactions and snapshot support.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Deferred.
-\end_layout
-
-\end_body
-\end_document
diff --git a/lib/tdb2/doc/design.lyx,v b/lib/tdb2/doc/design.lyx,v
deleted file mode 100644
index 13e6387f7f..0000000000
--- a/lib/tdb2/doc/design.lyx,v
+++ /dev/null
@@ -1,4679 +0,0 @@
-head	1.13;
-access;
-symbols;
-locks; strict;
-comment	@# @;
-
-
-1.13
-date	2011.03.01.11.46.54;	author rusty;	state Exp;
-branches;
-next	1.12;
-
-1.12
-date	2010.12.01.12.20.49;	author rusty;	state Exp;
-branches;
-next	1.11;
-
-1.11
-date	2010.12.01.11.55.20;	author rusty;	state Exp;
-branches;
-next	1.10;
-
-1.10
-date	2010.09.14.00.33.57;	author rusty;	state Exp;
-branches;
-next	1.9;
-
-1.9
-date	2010.09.09.07.25.12;	author rusty;	state Exp;
-branches;
-next	1.8;
-
-1.8
-date	2010.09.02.02.29.05;	author rusty;	state Exp;
-branches;
-next	1.7;
-
-1.7
-date	2010.09.01.10.58.12;	author rusty;	state Exp;
-branches;
-next	1.6;
-
-1.6
-date	2010.08.02.00.21.43;	author rusty;	state Exp;
-branches;
-next	1.5;
-
-1.5
-date	2010.08.02.00.21.16;	author rusty;	state Exp;
-branches;
-next	1.4;
-
-1.4
-date	2010.05.10.13.09.11;	author rusty;	state Exp;
-branches;
-next	1.3;
-
-1.3
-date	2010.05.10.11.58.37;	author rusty;	state Exp;
-branches;
-next	1.2;
-
-1.2
-date	2010.05.10.05.35.13;	author rusty;	state Exp;
-branches;
-next	1.1;
-
-1.1
-date	2010.05.04.02.29.16;	author rusty;	state Exp;
-branches;
-next	;
-
-
-desc
-@First draft
-@
-
-
-1.13
-log
-@Thread-safe API
-@
-text
-@#LyX 1.6.7 created this file. For more info see http://www.lyx.org/
-\lyxformat 345
-\begin_document
-\begin_header
-\textclass article
-\use_default_options true
-\language english
-\inputencoding auto
-\font_roman default
-\font_sans default
-\font_typewriter default
-\font_default_family default
-\font_sc false
-\font_osf false
-\font_sf_scale 100
-\font_tt_scale 100
-
-\graphics default
-\paperfontsize default
-\use_hyperref false
-\papersize default
-\use_geometry false
-\use_amsmath 1
-\use_esint 1
-\cite_engine basic
-\use_bibtopic false
-\paperorientation portrait
-\secnumdepth 3
-\tocdepth 3
-\paragraph_separation indent
-\defskip medskip
-\quotes_language english
-\papercolumns 1
-\papersides 1
-\paperpagestyle default
-\tracking_changes true
-\output_changes true
-\author "Rusty Russell,,,"
-\author ""
-\end_header
-
-\begin_body
-
-\begin_layout Title
-TDB2: Redesigning The Trivial DataBase
-\end_layout
-
-\begin_layout Author
-Rusty Russell, IBM Corporation
-\end_layout
-
-\begin_layout Date
-1-December-2010
-\end_layout
-
-\begin_layout Abstract
-The Trivial DataBase on-disk format is 32 bits; with usage cases heading
- towards the 4G limit, that must change.
- This required breakage provides an opportunity to revisit TDB's other design
- decisions and reassess them.
-\end_layout
-
-\begin_layout Section
-Introduction
-\end_layout
-
-\begin_layout Standard
-The Trivial DataBase was originally written by Andrew Tridgell as a simple
- key/data pair storage system with the same API as dbm, but allowing multiple
- readers and writers while being small enough (< 1000 lines of C) to include
- in SAMBA.
- The simple design created in 1999 has proven surprisingly robust and performant
-, used in Samba versions 3 and 4 as well as numerous other projects.
- Its useful life was greatly increased by the (backwards-compatible!) addition
- of transaction support in 2005.
-\end_layout
-
-\begin_layout Standard
-The wider variety and greater demands of TDB-using code have led to some
- organic growth of the API, as well as some compromises on the implementation.
- None of these, by themselves, are seen as show-stoppers, but the cumulative
- effect is a loss of elegance over the initial, simple TDB implementation.
- Here is a table of the approximate number of lines of implementation code
- and number of API functions at the end of each year:
-\end_layout
-
-\begin_layout Standard
-\begin_inset Tabular
-<lyxtabular version="3" rows="12" columns="3">
-<features>
-<column alignment="center" valignment="top" width="0">
-<column alignment="center" valignment="top" width="0">
-<column alignment="center" valignment="top" width="0">
-<row>
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-Year End
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-API Functions
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-Lines of C Code Implementation
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-1999
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-13
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-1195
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2000
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-24
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-1725
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2001
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-32
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2228
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2002
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-35
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2481
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2003
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-35
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2552
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2004
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-40
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2584
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2005
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-38
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2647
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2006
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-52
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-3754
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2007
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-66
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-4398
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2008
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-71
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-4768
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2009
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-73
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-5715
-\end_layout
-
-\end_inset
-</cell>
-</row>
-</lyxtabular>
-
-\end_inset
-
-
-\end_layout
-
-\begin_layout Standard
-This review is an attempt to catalog and address all the known issues with
- TDB and create solutions which address the problems without significantly
- increasing complexity; all involved are far too aware of the dangers of
- second system syndrome in rewriting a successful project like this.
-\end_layout
-
-\begin_layout Section
-API Issues
-\end_layout
-
-\begin_layout Subsection
-tdb_open_ex Is Not Expandable
-\end_layout
-
-\begin_layout Standard
-The tdb_open() call was expanded to tdb_open_ex(), which added an optional
- hashing function and an optional logging function argument.
- Additional arguments to open would require the introduction of a tdb_open_ex2
- call etc.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\begin_inset CommandInset label
-LatexCommand label
-name "attributes"
-
-\end_inset
-
-
-\end_layout
-
-\begin_layout Standard
-tdb_open() will take a linked-list of attributes:
-\end_layout
-
-\begin_layout LyX-Code
-enum tdb_attribute {
-\end_layout
-
-\begin_layout LyX-Code
-    TDB_ATTRIBUTE_LOG = 0,
-\end_layout
-
-\begin_layout LyX-Code
-    TDB_ATTRIBUTE_HASH = 1
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_attribute_base {
-\end_layout
-
-\begin_layout LyX-Code
-    enum tdb_attribute attr;
-\end_layout
-
-\begin_layout LyX-Code
-    union tdb_attribute *next;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_attribute_log {
-\end_layout
-
-\begin_layout LyX-Code
-    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
-\end_layout
-
-\begin_layout LyX-Code
-    tdb_log_func log_fn;
-\end_layout
-
-\begin_layout LyX-Code
-    void *log_private;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_attribute_hash {
-\end_layout
-
-\begin_layout LyX-Code
-    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
-\end_layout
-
-\begin_layout LyX-Code
-    tdb_hash_func hash_fn;
-\end_layout
-
-\begin_layout LyX-Code
-    void *hash_private;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout LyX-Code
-union tdb_attribute {
-\end_layout
-
-\begin_layout LyX-Code
-    struct tdb_attribute_base base;
-\end_layout
-
-\begin_layout LyX-Code
-    struct tdb_attribute_log log;
-\end_layout
-
-\begin_layout LyX-Code
-    struct tdb_attribute_hash hash;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout Standard
-This allows future attributes to be added, even if this expands the size
- of the union.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-tdb_traverse Makes Impossible Guarantees
-\end_layout
-
-\begin_layout Standard
-tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, and it
- was thought that it was important to guarantee that all records which exist
- at the start and end of the traversal would be included, and no record
- would be included twice.
-\end_layout
-
-\begin_layout Standard
-This adds complexity (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "Reliable-Traversal-Adds"
-
-\end_inset
-
-) and does not work anyway for records which are altered (in particular,
- those which are expanded may be effectively deleted and re-added behind
- the traversal).
-\end_layout
-
-\begin_layout Subsubsection
-\begin_inset CommandInset label
-LatexCommand label
-name "traverse-Proposed-Solution"
-
-\end_inset
-
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Abandon the guarantee.
- You will see every record if no changes occur during your traversal, otherwise
- you will see some subset.
- You can prevent changes by using a transaction or the locking API.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
- Delete-during-traverse will still delete every record, too (assuming no
- other changes).
-\end_layout
-
-\begin_layout Subsection
-Nesting of Transactions Is Fraught
-\end_layout
-
-\begin_layout Standard
-TDB has alternated between allowing nested transactions and not allowing
- them.
- Various paths in the Samba codebase assume that transactions will nest,
- and in a sense they can: the operation is only committed to disk when the
- outer transaction is committed.
- There are two problems, however:
-\end_layout
-
-\begin_layout Enumerate
-Canceling the inner transaction will cause the outer transaction commit
- to fail, and will not undo any operations since the inner transaction began.
- This problem is soluble with some additional internal code.
-\end_layout
-
-\begin_layout Enumerate
-An inner transaction commit can be cancelled by the outer transaction.
- This is desirable in the way which Samba's database initialization code
- uses transactions, but could be a surprise to any users expecting a successful
- transaction commit to expose changes to others.
-\end_layout
-
-\begin_layout Standard
-The current solution is to specify the behavior at tdb_open(), with the
- default currently that nested transactions are allowed.
- This flag can also be changed at runtime.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Given the usage patterns, it seems that the
-\begin_inset Quotes eld
-\end_inset
-
-least-surprise
-\begin_inset Quotes erd
-\end_inset
-
- behavior of disallowing nested transactions should become the default.
- Additionally, it seems the outer transaction is the only code which knows
- whether inner transactions should be allowed, so a flag to indicate this
- could be added to tdb_transaction_start.
- However, this behavior can be simulated with a wrapper which uses tdb_add_flags
-() and tdb_remove_flags(), so the API should not be expanded for this relatively
--obscure case.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-
-\change_deleted 0 1298979572
-Incomplete; nesting flag is still defined as per tdb1.
-\change_inserted 0 1298979584
-Complete; the nesting flag has been removed.
-\change_unchanged
-
-\end_layout
-
-\begin_layout Subsection
-Incorrect Hash Function is Not Detected
-\end_layout
-
-\begin_layout Standard
-tdb_open_ex() allows the calling code to specify a different hash function
- to use, but does not check that all other processes accessing this tdb
- are using the same hash function.
- The result is that records are missing from tdb_fetch().
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The header should contain an example hash result (eg.
- the hash of 0xdeadbeef), and tdb_open_ex() should check that the given
- hash function produces the same answer, or fail the tdb_open call.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-tdb_set_max_dead/TDB_VOLATILE Expose Implementation
-\end_layout
-
-\begin_layout Standard
-In response to scalability issues with the free list (
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "TDB-Freelist-Is"
-
-\end_inset
-
-) two API workarounds have been incorporated in TDB: tdb_set_max_dead()
- and the TDB_VOLATILE flag to tdb_open.
- The latter actually calls the former with an argument of
-\begin_inset Quotes eld
-\end_inset
-
-5
-\begin_inset Quotes erd
-\end_inset
-
-.
-\end_layout
-
-\begin_layout Standard
-This code allows deleted records to accumulate without putting them in the
- free list.
- On delete we iterate through each chain and free them in a batch if there
- are more than max_dead entries.
- These are never otherwise recycled except as a side-effect of a tdb_repack.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-With the scalability problems of the freelist solved, this API can be removed.
- The TDB_VOLATILE flag may still be useful as a hint that store and delete
- of records will be at least as common as fetch in order to allow some internal
- tuning, but initially will become a no-op.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Incomplete.
- TDB_VOLATILE still defined, but implementation should fail on unknown flags
- to be future-proof.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "TDB-Files-Cannot"
-
-\end_inset
-
-TDB Files Cannot Be Opened Multiple Times In The Same Process
-\end_layout
-
-\begin_layout Standard
-No process can open the same TDB twice; we check and disallow it.
- This is an unfortunate side-effect of fcntl locks, which operate on a per-file
- rather than per-file-descriptor basis, and do not nest.
- Thus, closing any file descriptor on a file clears all the locks obtained
- by this process, even if they were placed using a different file descriptor!
-\end_layout
-
-\begin_layout Standard
-Note that even if this were solved, deadlock could occur if operations were
- nested: this is a more manageable programming error in most cases.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-We could lobby POSIX to fix the perverse rules, or at least lobby Linux
- to violate them so that the most common implementation does not have this
- restriction.
- This would be a generally good idea for other fcntl lock users.
-\end_layout
-
-\begin_layout Standard
-Samba uses a wrapper which hands out the same tdb_context to multiple callers
- if this happens, and does simple reference counting.
- We should do this inside the tdb library, which already emulates lock nesting
- internally; it would need to recognize when deadlock occurs within a single
- process.
- This would create a new failure mode for tdb operations (while we currently
- handle locking failures, they are impossible in normal use and a process
- encountering them can do little but give up).
-\end_layout
-
-\begin_layout Standard
-I do not see benefit in an additional tdb_open flag to indicate whether
- re-opening is allowed, although there may be some benefit to adding a
- call to detect when a tdb_context is shared, to allow others to create such
- an API.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Incomplete.
-\end_layout
-
-\begin_layout Subsection
-TDB API Is Not POSIX Thread-safe
-\end_layout
-
-\begin_layout Standard
-The TDB API uses an error code which can be queried after an operation to
- determine what went wrong.
- This programming model does not work with threads, unless specific additional
- guarantees are given by the implementation.
- In addition, even otherwise-independent threads cannot open the same TDB
- (as in
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "TDB-Files-Cannot"
-
-\end_inset
-
-).
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Rearchitecting the API to include a tdb_errcode pointer would be a great
- deal of churn
-\change_inserted 0 1298979557
-, but fortunately most functions return 0 on success and -1 on error: we
- can change these to return 0 on success and a negative error code on error,
- and the API remains similar to previous.
- The tdb_fetch, tdb_firstkey and tdb_nextkey functions need to take a TDB_DATA
- pointer and return an error code.
- It is also simpler to have tdb_nextkey replace its key argument in place,
- freeing up any old .dptr.
-\end_layout
-
-\begin_layout Standard
-
-\change_deleted 0 1298979438
-; we are better to guarantee that the tdb_errcode is per-thread so the current
- programming model can be maintained.
-\end_layout
-
-\begin_layout Standard
-
-\change_deleted 0 1298979438
-This requires dynamic per-thread allocations, which is awkward with POSIX
- threads (pthread_key_create space is limited and we cannot simply allocate
- a key for every TDB).
-\change_unchanged
-
-\end_layout
-
-\begin_layout Standard
-Internal locking is required to make sure that fcntl locks do not overlap
- between threads, and also that the global list of tdbs is maintained.
-\end_layout
-
-\begin_layout Standard
-The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
- version of the library, and otherwise no overhead will exist.
- Alternatively, a hooking mechanism similar to that proposed for
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "Proposed-Solution-locking-hook"
-
-\end_inset
-
- could be used to enable pthread locking at runtime.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Incomplete
-\change_inserted 0 1298979681
-; API has been changed but thread safety has not been implemented.
-\change_deleted 0 1298979669
-.
-\change_unchanged
-
-\end_layout
-
-\begin_layout Subsection
-*_nonblock Functions And *_mark Functions Expose Implementation
-\end_layout
-
-\begin_layout Standard
-CTDB
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-Clustered TDB, see http://ctdb.samba.org
-\end_layout
-
-\end_inset
-
- wishes to operate on TDB in a non-blocking manner.
- This is currently done as follows:
-\end_layout
-
-\begin_layout Enumerate
-Call the _nonblock variant of an API function (eg.
- tdb_lockall_nonblock).
- If this fails:
-\end_layout
-
-\begin_layout Enumerate
-Fork a child process, and wait for it to call the normal variant (eg.
- tdb_lockall).
-\end_layout
-
-\begin_layout Enumerate
-If the child succeeds, call the _mark variant to indicate we already have
- the locks (eg.
- tdb_lockall_mark).
-\end_layout
-
-\begin_layout Enumerate
-Upon completion, tell the child to release the locks (eg.
- tdb_unlockall).
-\end_layout
-
-\begin_layout Enumerate
-Indicate to tdb that it should consider the locks removed (eg.
- tdb_unlockall_mark).
-\end_layout
-
-\begin_layout Standard
-There are several issues with this approach.
- Firstly, adding two new variants of each function clutters the API for
- an obscure use, and so not all functions have three variants.
- Secondly, it assumes that all paths of the functions ask for the same locks,
- otherwise the parent process will have to get a lock which the child doesn't
- have under some circumstances.
- I don't believe this is currently the case, but it constrains the implementatio
-n.
-
-\end_layout
-
-\begin_layout Subsubsection
-\begin_inset CommandInset label
-LatexCommand label
-name "Proposed-Solution-locking-hook"
-
-\end_inset
-
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Implement a hook for locking methods, so that the caller can control the
- calls to create and remove fcntl locks.
- In this scenario, ctdbd would operate as follows:
-\end_layout
-
-\begin_layout Enumerate
-Call the normal API function, eg tdb_lockall().
-\end_layout
-
-\begin_layout Enumerate
-When the lock callback comes in, check if the child has the lock.
- Initially, this is always false.
- If so, return 0.
- Otherwise, try to obtain it in non-blocking mode.
- If that fails, return EWOULDBLOCK.
-\end_layout
-
-\begin_layout Enumerate
-Release locks in the unlock callback as normal.
-\end_layout
-
-\begin_layout Enumerate
-If tdb_lockall() fails, see if we recorded a lock failure; if so, call the
- child to repeat the operation.
-\end_layout
-
-\begin_layout Enumerate
-The child records what locks it obtains, and returns that information to
- the parent.
-\end_layout
-
-\begin_layout Enumerate
-When the child has succeeded, goto 1.
-\end_layout
-
-\begin_layout Standard
-This is flexible enough to handle any potential locking scenario, even when
- lock requirements change.
- It can be optimized so that the parent does not release locks, just tells
- the child which locks it doesn't need to obtain.
-\end_layout
-
-\begin_layout Standard
-It also keeps the complexity out of the API, and in ctdbd where it is needed.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Incomplete.
-\end_layout
-
-\begin_layout Subsection
-tdb_chainlock Functions Expose Implementation
-\end_layout
-
-\begin_layout Standard
-tdb_chainlock locks some number of records, including the record indicated
- by the given key.
- This gave atomicity guarantees; no-one can start a transaction, alter,
- read or delete that key while the lock is held.
-\end_layout
-
-\begin_layout Standard
-It also makes the same guarantee for any other key in the chain, which is
- an internal implementation detail and potentially a cause for deadlock.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-None.
- It would be nice to have an explicit single entry lock which affected no
- other keys.
- Unfortunately, this won't work for an entry which doesn't exist.
- Thus while chainlock may be implemented more efficiently for the existing
- case, it will still have overlap issues with the non-existing case.
- So it is best to keep the current (lack of) guarantee about which records
- will be affected to avoid constraining our implementation.
-\end_layout
-
-\begin_layout Subsection
-Signal Handling is Not Race-Free
-\end_layout
-
-\begin_layout Standard
-The tdb_setalarm_sigptr() call allows the caller's signal handler to indicate
- that the tdb locking code should return with a failure, rather than trying
- again when a signal is received (and errno == EAGAIN).
- This is usually used to implement timeouts.
-\end_layout
-
-\begin_layout Standard
-Unfortunately, this does not work in the case where the signal is received
- before the tdb code enters the fcntl() call to place the lock: the code
- will sleep within the fcntl() code, unaware that the signal wants it to
- exit.
- In the case of long timeouts, this does not happen in practice.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The locking hooks proposed in
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "Proposed-Solution-locking-hook"
-
-\end_inset
-
- would allow the user to decide on whether to fail the lock acquisition
- on a signal.
- This allows the caller to choose their own compromise: they could narrow
- the race by checking immediately before the fcntl call.
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-It may be possible to make this race-free in some implementations by having
- the signal handler alter the struct flock to make it invalid.
- This will cause the fcntl() lock call to fail with EINVAL if the signal
- occurs before the kernel is entered, otherwise EAGAIN.
-\end_layout
-
-\end_inset
-
-
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Incomplete.
-\end_layout
-
-\begin_layout Subsection
-The API Uses Gratuitous Typedefs, Capitals
-\end_layout
-
-\begin_layout Standard
-typedefs are useful for providing source compatibility when types can differ
- across implementations, or arguably in the case of function pointer definitions
- which are hard for humans to parse.
- Otherwise it is simply obfuscation and pollutes the namespace.
-\end_layout
-
-\begin_layout Standard
-Capitalization is usually reserved for compile-time constants and macros.
-\end_layout
-
-\begin_layout Description
-TDB_CONTEXT There is no reason to use this over 'struct tdb_context'; the
- definition isn't visible to the API user anyway.
-\end_layout
-
-\begin_layout Description
-TDB_DATA There is no reason to use this over struct TDB_DATA; the struct
- needs to be understood by the API user.
-\end_layout
-
-\begin_layout Description
-struct
-\begin_inset space ~
-\end_inset
-
-TDB_DATA This would normally be called 'struct tdb_data'.
-\end_layout
-
-\begin_layout Description
-enum
-\begin_inset space ~
-\end_inset
-
-TDB_ERROR Similarly, this would normally be enum tdb_error.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-None.
- Introducing lower case variants would please pedants like myself, but if
- it were done the existing ones should be kept.
- There is little point forcing a purely cosmetic change upon tdb users.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "tdb_log_func-Doesnt-Take"
-
-\end_inset
-
-tdb_log_func Doesn't Take The Private Pointer
-\end_layout
-
-\begin_layout Standard
-For API compatibility reasons, the logging function needs to call tdb_get_loggin
-g_private() to retrieve the pointer registered by the tdb_open_ex for logging.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-It should simply take an extra argument, since we are prepared to break
- the API/ABI.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-Various Callback Functions Are Not Typesafe
-\end_layout
-
-\begin_layout Standard
-The callback functions in tdb_set_logging_function (after
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "tdb_log_func-Doesnt-Take"
-
-\end_inset
-
- is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read and tdb_check
- all take void * and must internally convert it to the argument type they
- were expecting.
-\end_layout
-
-\begin_layout Standard
-If this type changes, the compiler will not produce warnings on the callers,
- since it only sees void *.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-With careful use of macros, we can create callback functions which give
- a warning when used on gcc and the types of the callback and its private
- argument differ.
- Unsupported compilers will not give a warning, which is no worse than now.
- In addition, the callbacks become clearer, as they need not use void *
- for their parameter.
-\end_layout
-
-\begin_layout Standard
-See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Incomplete.
-\end_layout
-
-\begin_layout Subsection
-TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
-\end_layout
-
-\begin_layout Standard
-The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB file should
- be cleared if the caller discovers it is the only process with the TDB
- open.
- However, if any caller does not specify TDB_CLEAR_IF_FIRST it will not
- be detected, so will have the TDB erased underneath them (usually resulting
- in a crash).
-\end_layout
-
-\begin_layout Standard
-There is a similar issue on fork(); if the parent exits (or otherwise closes
- the tdb) before the child calls tdb_reopen_all() to establish the lock
- used to indicate the TDB is opened by someone, a TDB_CLEAR_IF_FIRST opener
- at that moment will believe it alone has opened the TDB and will erase
- it.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Remove TDB_CLEAR_IF_FIRST.
- Other workarounds are possible, but see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
-
-\end_inset
-
-.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-
-\change_deleted 0 1298979699
-Incomplete, TDB_CLEAR_IF_FIRST still defined, but not implemented.
-\change_inserted 0 1298979700
-Complete.
-\change_unchanged
-
-\end_layout
-
-\begin_layout Subsection
-Extending The Header Is Difficult
-\end_layout
-
-\begin_layout Standard
-We have reserved (zeroed) words in the TDB header, which can be used for
- future features.
- If the future features are compulsory, the version number must be updated
- to prevent old code from accessing the database.
- But if the future feature is optional, we have no way of telling if older
- code is accessing the database or not.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The header should contain a
-\begin_inset Quotes eld
-\end_inset
-
-format variant
-\begin_inset Quotes erd
-\end_inset
-
- value (64-bit).
- This is divided into two 32-bit parts:
-\end_layout
-
-\begin_layout Enumerate
-The lower part reflects the format variant understood by code accessing
- the database.
-\end_layout
-
-\begin_layout Enumerate
-The upper part reflects the format variant you must understand to write
- to the database (otherwise you can only open for reading).
-\end_layout
-
-\begin_layout Standard
-The latter field can only be written at creation time, the former should
- be written under the OPEN_LOCK when opening the database for writing, if
- the variant of the code is lower than the current lowest variant.
-\end_layout
-
-\begin_layout Standard
-This should allow backwards-compatible features to be added, and detection
- if older code (which doesn't understand the feature) writes to the database.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Incomplete.
-\end_layout
-
-\begin_layout Subsection
-Record Headers Are Not Expandable
-\end_layout
-
-\begin_layout Standard
-If we later want to add (say) checksums on keys and data, it would require
- another format change, which we'd like to avoid.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-We often have extra padding at the tail of a record.
- If we ensure that the first byte (if any) of this padding is zero, we will
- have a way for future changes to detect code which doesn't understand a
- new format: the new code would write (say) a 1 at the tail, and thus if
- there is no tail or the first byte is 0, we would know the extension is
- not present on that record.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Incomplete.
-\end_layout
-
-\begin_layout Subsection
-TDB Does Not Use Talloc
-\end_layout
-
-\begin_layout Standard
-Many users of TDB (particularly Samba) use the talloc allocator, and thus
- have to wrap TDB in a talloc context to use it conveniently.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The allocation within TDB is not complicated enough to justify the use of
- talloc, and I am reluctant to force another (excellent) library on TDB
- users.
- Nonetheless a compromise is possible.
- An attribute (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "attributes"
-
-\end_inset
-
-) can be added later to tdb_open() to provide an alternate allocation mechanism,
- specifically for talloc but usable by any other allocator (which would
- ignore the
-\begin_inset Quotes eld
-\end_inset
-
-context
-\begin_inset Quotes erd
-\end_inset
-
- argument).
-\end_layout
-
-\begin_layout Standard
-This would form a talloc hierarchy as expected, but the caller would still
- have to attach a destructor to the tdb context returned from tdb_open to
- close it.
- All TDB_DATA fields would be children of the tdb_context, and the caller
- would still have to manage them (using talloc_free() or talloc_steal()).
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Deferred.
-\end_layout
-
-\begin_layout Section
-Performance And Scalability Issues
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "TDB_CLEAR_IF_FIRST-Imposes-Performance"
-
-\end_inset
-
-TDB_CLEAR_IF_FIRST Imposes Performance Penalty
-\end_layout
-
-\begin_layout Standard
-When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is placed at offset
- 4 (aka.
- the ACTIVE_LOCK).
- While these locks never conflict in normal tdb usage, they do add substantial
- overhead for most fcntl lock implementations when the kernel scans to detect
- if a lock conflict exists.
- This is often a single linked list, making the time to acquire and release
- a fcntl lock O(N) where N is the number of processes with the TDB open,
- not the number actually doing work.
-\end_layout
-
-\begin_layout Standard
-In a Samba server it is common to have huge numbers of clients sitting idle,
- and thus they have weaned themselves off the TDB_CLEAR_IF_FIRST flag.
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-There is a flag to tdb_reopen_all() which is used for this optimization:
- if the parent process will outlive the child, the child does not need the
- ACTIVE_LOCK.
- This is a workaround for this very performance issue.
-\end_layout
-
-\end_inset
-
-
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Remove the flag.
- It was a neat idea, but even trivial servers tend to know when they are
- initializing for the first time and can simply unlink the old tdb at that
- point.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-
-\change_deleted 0 1298979837
-Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing.
-\change_inserted 0 1298979837
-Complete.
-\change_unchanged
-
-\end_layout
-
-\begin_layout Subsection
-TDB Files Have a 4G Limit
-\end_layout
-
-\begin_layout Standard
-This seems to be becoming an issue (so much for
-\begin_inset Quotes eld
-\end_inset
-
-trivial
-\begin_inset Quotes erd
-\end_inset
-
-!), particularly for ldb.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-A new, incompatible TDB format which uses 64 bit offsets internally rather
- than 32 bit as now.
- For simplicity of endian conversion (which TDB does on the fly if required),
- all values will be 64 bit on disk.
- In practice, some upper bits may be used for other purposes, but at least
- 56 bits will be available for file offsets.
-\end_layout
-
-\begin_layout Standard
-tdb_open() will automatically detect the old version, and even create them
- if TDB_VERSION6 is specified to tdb_open.
-\end_layout
-
-\begin_layout Standard
-32 bit processes will still be able to access TDBs larger than 4G (assuming
- that their off_t allows them to seek to 64 bits), they will gracefully
- fall back as they fail to mmap.
- This can happen already with large TDBs.
-\end_layout
-
-\begin_layout Standard
-Old versions of tdb will fail to open the new TDB files (since 28 August
- 2009, commit 398d0c29290: prior to that any unrecognized file format would
- be erased and initialized as a fresh tdb!)
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-TDB Records Have a 4G Limit
-\end_layout
-
-\begin_layout Standard
-This has not been a reported problem, and the API uses size_t which can
- be 64 bit on 64 bit platforms.
- However, other limits may have made such an issue moot.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Record sizes will be 64 bit, with an error returned on 32 bit platforms
- which try to access such records (the current implementation would return
- TDB_ERR_OOM in a similar case).
- It seems unlikely that 32 bit keys will be a limitation, so the implementation
- may not support this (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:Records-Incur-A"
-
-\end_inset
-
-).
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-Hash Size Is Determined At TDB Creation Time
-\end_layout
-
-\begin_layout Standard
-TDB contains a number of hash chains in the header; the number is specified
- at creation time, and defaults to 131.
- This is such a bottleneck on large databases (as each hash chain gets quite
- long), that LDB uses 10,000 for this hash.
- In general it is impossible to know what the 'right' answer is at database
- creation time.
-\end_layout
-
-\begin_layout Subsubsection
-\begin_inset CommandInset label
-LatexCommand label
-name "sub:Hash-Size-Solution"
-
-\end_inset
-
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-After comprehensive performance testing on various scalable hash variants
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoying
- because I was previously convinced that an expanding tree of hashes would
- be very close to optimal.
-\end_layout
-
-\end_inset
-
-, it became clear that it is hard to beat a straight linear hash table which
- doubles in size when it reaches saturation.
- Unfortunately, altering the hash table introduces serious locking complications
-: the entire hash table needs to be locked to enlarge the hash table, and
- others might be holding locks.
- Particularly insidious are insertions done under tdb_chainlock.
-\end_layout
-
-\begin_layout Standard
-Thus an expanding layered hash will be used: an array of hash groups, with
- each hash group exploding into pointers to lower hash groups once it fills,
- turning into a hash tree.
- This has implications for locking: we must lock the entire group in case
- we need to expand it, yet we don't know how deep the tree is at that point.
-\end_layout
-
-\begin_layout Standard
-Note that bits from the hash table entries should be stolen to hold more
- hash bits to reduce the penalty of collisions.
- We can use the otherwise-unused lower 3 bits.
- If we limit the size of the database to 64 exabytes, we can use the top
- 8 bits of the hash entry as well.
- These 11 bits would reduce false positives down to 1 in 2000 which is more
- than we need: we can use one of the bits to indicate that the extra hash
- bits are valid.
- This means we can choose not to re-hash all entries when we expand a hash
- group; simply use the next bits we need and mark them invalid.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "TDB-Freelist-Is"
-
-\end_inset
-
-TDB Freelist Is Highly Contended
-\end_layout
-
-\begin_layout Standard
-TDB uses a single linked list for the free list.
- Allocation occurs as follows, using heuristics which have evolved over
- time:
-\end_layout
-
-\begin_layout Enumerate
-Get the free list lock for this whole operation.
-\end_layout
-
-\begin_layout Enumerate
-Multiply length by 1.25, so we always over-allocate by 25%.
-\end_layout
-
-\begin_layout Enumerate
-Set the slack multiplier to 1.
-\end_layout
-
-\begin_layout Enumerate
-Examine the current freelist entry: if it is > length but < the current
- best case, remember it as the best case.
-\end_layout
-
-\begin_layout Enumerate
-Multiply the slack multiplier by 1.05.
-\end_layout
-
-\begin_layout Enumerate
-If our best fit so far is less than length * slack multiplier, return it.
- The slack will be turned into a new free record if it's large enough.
-\end_layout
-
-\begin_layout Enumerate
-Otherwise, go onto the next freelist entry.
-\end_layout
-
-\begin_layout Standard
-Deleting a record occurs as follows:
-\end_layout
-
-\begin_layout Enumerate
-Lock the hash chain for this whole operation.
-\end_layout
-
-\begin_layout Enumerate
-Walk the chain to find the record, keeping the prev pointer offset.
-\end_layout
-
-\begin_layout Enumerate
-If max_dead is non-zero:
-\end_layout
-
-\begin_deeper
-\begin_layout Enumerate
-Walk the hash chain again and count the dead records.
-\end_layout
-
-\begin_layout Enumerate
-If it's more than max_dead, bulk free all the dead ones (similar to steps
- 4 and below, but the lock is only obtained once).
-\end_layout
-
-\begin_layout Enumerate
-Simply mark this record as dead and return.
-
-\end_layout
-
-\end_deeper
-\begin_layout Enumerate
-Get the free list lock for the remainder of this operation.
-\end_layout
-
-\begin_layout Enumerate
-\begin_inset CommandInset label
-LatexCommand label
-name "right-merging"
-
-\end_inset
-
-Examine the following block to see if it is free; if so, enlarge the current
- block and remove that block from the free list.
- This was disabled, as removal from the free list was O(entries-in-free-list).
-\end_layout
-
-\begin_layout Enumerate
-Examine the preceding block to see if it is free: for this reason, each
- block has a 32-bit tailer which indicates its length.
- If it is free, expand it to cover our new block and return.
-\end_layout
-
-\begin_layout Enumerate
-Otherwise, prepend ourselves to the free list.
-\end_layout
-
-\begin_layout Standard
-Disabling right-merging (step
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "right-merging"
-
-\end_inset
-
-) causes fragmentation; the other heuristics proved insufficient to address
- this, so the final answer to this was that when we expand the TDB file
- inside a transaction commit, we repack the entire tdb.
-\end_layout
-
-\begin_layout Standard
-The single list lock limits our allocation rate; due to the other issues
- this is not currently seen as a bottleneck.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The first step is to remove all the current heuristics, as they obviously
- interact, then examine them once the lock contention is addressed.
-\end_layout
-
-\begin_layout Standard
-The free list must be split to reduce contention.
- Assuming perfect free merging, we can at most have 1 free list entry for
- each entry.
- This implies that the number of free lists is related to the size of the
- hash table, but as it is rare to walk a large number of free list entries
- we can use far fewer, say 1/32 of the number of hash buckets.
-\end_layout
-
-\begin_layout Standard
-It seems tempting to try to reuse the hash implementation which we use for
- records here, but we have two ways of searching for free entries: for allocatio
-n we search by size (and possibly zone) which produces too many clashes
- for our hash table to handle well, and for coalescing we search by address.
- Thus an array of doubly-linked free lists seems preferable.
-\end_layout
-
-\begin_layout Standard
-There are various benefits in using per-size free lists (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:TDB-Becomes-Fragmented"
-
-\end_inset
-
-) but it's not clear this would reduce contention in the common case where
- all processes are allocating/freeing the same size.
- Thus we almost certainly need to divide in other ways: the most obvious
- is to divide the file into zones, and using a free list (or table of free
- lists) for each.
- This approximates address ordering.
-\end_layout
-
-\begin_layout Standard
-Unfortunately it is difficult to know what heuristics should be used to
- determine zone sizes, and our transaction code relies on being able to
- create a
-\begin_inset Quotes eld
-\end_inset
-
-recovery area
-\begin_inset Quotes erd
-\end_inset
-
- by simply appending to the file (difficult if it would need to create a
- new zone header).
- Thus we use a linked-list of free tables; currently we only ever create
- one, but if there is more than one we choose one at random to use.
- In future we may use heuristics to add new free tables on contention.
- We only expand the file when all free tables are exhausted.
-\end_layout
-
-\begin_layout Standard
-The basic algorithm is as follows.
- Freeing is simple:
-\end_layout
-
-\begin_layout Enumerate
-Identify the correct free list.
-\end_layout
-
-\begin_layout Enumerate
-Lock the corresponding list.
-\end_layout
-
-\begin_layout Enumerate
-Re-check the list (we didn't have a lock, sizes could have changed): relock
- if necessary.
-\end_layout
-
-\begin_layout Enumerate
-Place the freed entry in the list.
-\end_layout
-
-\begin_layout Standard
-Allocation is a little more complicated, as we perform delayed coalescing
- at this point:
-\end_layout
-
-\begin_layout Enumerate
-Pick a free table; usually the previous one.
-\end_layout
-
-\begin_layout Enumerate
-Lock the corresponding list.
-\end_layout
-
-\begin_layout Enumerate
-If the top entry is large enough, remove it from the list and return it.
-\end_layout
-
-\begin_layout Enumerate
-Otherwise, coalesce entries in the list. If there was no entry large enough,
- unlock the list and try the next largest list.
-\end_layout
-
-\begin_layout Enumerate
-If no list has an entry which meets our needs, try the next free table.
-\end_layout
-
-\begin_layout Enumerate
-If no zone satisfies, expand the file.
-\end_layout
-
-\begin_layout Standard
-This optimizes rapid insert/delete of free list entries by not coalescing
- them all the time.
- First-fit address ordering seems to be fairly good for keeping
- fragmentation low (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:TDB-Becomes-Fragmented"
-
-\end_inset
-
-).
- Note that address ordering does not need a tailer to coalesce, though if
- we needed one we could have one cheaply: see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:Records-Incur-A"
-
-\end_inset
-
-.
-
-\end_layout
-
-\begin_layout Standard
-Each free entry has the free table number in the header: less than 255.
- It also contains a doubly-linked list for easy deletion.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "sub:TDB-Becomes-Fragmented"
-
-\end_inset
-
-TDB Becomes Fragmented
-\end_layout
-
-\begin_layout Standard
-Much of this is a result of allocation strategy
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 ftp://ftp.cs.ute
-xas.edu/pub/garbage/malloc/ismm98.ps
-\end_layout
-
-\end_inset
-
- and deliberate hobbling of coalescing; internal fragmentation (aka overallocati
-on) is deliberately set at 25%, and external fragmentation is only cured
- by the decision to repack the entire db when a transaction commit needs
- to enlarge the file.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The 25% overhead on allocation works in practice for ldb because indexes
- tend to expand by one record at a time.
- This internal fragmentation can be resolved by having an
-\begin_inset Quotes eld
-\end_inset
-
-expanded
-\begin_inset Quotes erd
-\end_inset
-
- bit in the header to note entries that have previously expanded, and allocating
- more space for them.
-\end_layout
-
-\begin_layout Standard
-There is a spectrum of possible solutions for external fragmentation:
- one is to use a fragmentation-avoiding allocation strategy such as best-fit
- address-order allocator.
- The other end of the spectrum would be to use a bump allocator (very fast
- and simple) and simply repack the file when we reach the end.
-\end_layout
-
-\begin_layout Standard
-There are three problems with efficient fragmentation-avoiding allocators:
- they are non-trivial, they tend to use a single free list for each size,
- and there's no evidence that tdb allocation patterns will match those recorded
- for general allocators (though it seems likely).
-\end_layout
-
-\begin_layout Standard
-Thus we don't spend too much effort on external fragmentation; we will be
- no worse than the current code if we need to repack on occasion.
- More effort is spent on reducing freelist contention, and reducing overhead.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "sub:Records-Incur-A"
-
-\end_inset
-
-Records Incur A 28-Byte Overhead
-\end_layout
-
-\begin_layout Standard
-Each TDB record has a header as follows:
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_record {
-\end_layout
-
-\begin_layout LyX-Code
-        tdb_off_t next; /* offset of the next record in the list */
-\end_layout
-
-\begin_layout LyX-Code
-        tdb_len_t rec_len; /* total byte length of record */
-\end_layout
-
-\begin_layout LyX-Code
-        tdb_len_t key_len; /* byte length of key */
-\end_layout
-
-\begin_layout LyX-Code
-        tdb_len_t data_len; /* byte length of data */
-\end_layout
-
-\begin_layout LyX-Code
-        uint32_t full_hash; /* the full 32 bit hash of the key */
-\end_layout
-
-\begin_layout LyX-Code
-        uint32_t magic;   /* try to catch errors */
-\end_layout
-
-\begin_layout LyX-Code
-        /* the following union is implied:
-\end_layout
-
-\begin_layout LyX-Code
-                union {
-\end_layout
-
-\begin_layout LyX-Code
-                        char record[rec_len];
-\end_layout
-
-\begin_layout LyX-Code
-                        struct {
-\end_layout
-
-\begin_layout LyX-Code
-                                char key[key_len];
-\end_layout
-
-\begin_layout LyX-Code
-                                char data[data_len];
-\end_layout
-
-\begin_layout LyX-Code
-                        }
-\end_layout
-
-\begin_layout LyX-Code
-                        uint32_t totalsize; (tailer)
-\end_layout
-
-\begin_layout LyX-Code
-                }
-\end_layout
-
-\begin_layout LyX-Code
-        */
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout Standard
-Naively, this would double to a 56-byte overhead on a 64 bit implementation.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-We can use various techniques to reduce this for an allocated block:
-\end_layout
-
-\begin_layout Enumerate
-The 'next' pointer is not required, as we are using a flat hash table.
-\end_layout
-
-\begin_layout Enumerate
-'rec_len' can instead be expressed as an addition to key_len and data_len
- (it accounts for wasted or overallocated length in the record).
- Since the record length is always a multiple of 8, we can conveniently
- fit it in 32 bits (representing up to 35 bits).
-\end_layout
-
-\begin_layout Enumerate
-'key_len' and 'data_len' can be reduced.
- I'm unwilling to restrict 'data_len' to 32 bits, but instead we can combine
- the two into one 64-bit field and use a 5 bit value which indicates at
- what bit to divide the two.
- Keys are unlikely to scale as fast as data, so I'm assuming a maximum key
- size of 32 bits.
-\end_layout
-
-\begin_layout Enumerate
-'full_hash' is used to avoid a memcmp on the
-\begin_inset Quotes eld
-\end_inset
-
-miss
-\begin_inset Quotes erd
-\end_inset
-
- case, but this is diminishing returns after a handful of bits (at 10 bits,
- it reduces 99.9% of false memcmp).
- As an aside, as the lower bits are already incorporated in the hash table
- resolution, the upper bits should be used here.
- Note that it's not clear that these bits will be a win, given the extra
- bits in the hash table itself (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:Hash-Size-Solution"
-
-\end_inset
-
-).
-\end_layout
-
-\begin_layout Enumerate
-'magic' does not need to be enlarged: it currently reflects one of 5 values
- (used, free, dead, recovery, and unused_recovery).
- It is useful for quick sanity checking however, and should not be eliminated.
-\end_layout
-
-\begin_layout Enumerate
-'tailer' is only used to coalesce free blocks (so a block to the right can
- find the header to check if this block is free).
- This can be replaced by a single 'free' bit in the header of the following
- block (and the tailer only exists in free blocks).
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-This technique from Thomas Standish.
- Data Structure Techniques.
- Addison-Wesley, Reading, Massachusetts, 1980.
-\end_layout
-
-\end_inset
-
- The current proposed coalescing algorithm doesn't need this, however.
-\end_layout
-
-\begin_layout Standard
-This produces a 16 byte used header like this:
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_used_record {
-\end_layout
-
-\begin_layout LyX-Code
-        uint32_t used_magic : 16,
-\end_layout
-
-\begin_layout LyX-Code
-
-\end_layout
-
-\begin_layout LyX-Code
-                 key_data_divide: 5,
-\end_layout
-
-\begin_layout LyX-Code
-                 top_hash: 11;
-\end_layout
-
-\begin_layout LyX-Code
-        uint32_t extra_octets;
-\end_layout
-
-\begin_layout LyX-Code
-        uint64_t key_and_data_len;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout Standard
-And a free record like this:
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_free_record {
-\end_layout
-
-\begin_layout LyX-Code
-        uint64_t free_magic: 8,
-\end_layout
-
-\begin_layout LyX-Code
-                   prev : 56;
-\end_layout
-
-\begin_layout LyX-Code
-
-\end_layout
-
-\begin_layout LyX-Code
-        uint64_t free_table: 8,
-\end_layout
-
-\begin_layout LyX-Code
-                 total_length : 56;
-\end_layout
-
-\begin_layout LyX-Code
-        uint64_t next;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout Standard
-
-\change_deleted 0 1291206079
-
-\change_unchanged
-Note that by limiting valid offsets to 56 bits, we can pack everything we
- need into 3 64-bit words, meaning our minimum record size is 8 bytes.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-Transaction Commit Requires 4 fdatasync
-\end_layout
-
-\begin_layout Standard
-The current transaction algorithm is:
-\end_layout
-
-\begin_layout Enumerate
-write_recovery_data();
-\end_layout
-
-\begin_layout Enumerate
-sync();
-\end_layout
-
-\begin_layout Enumerate
-write_recovery_header();
-\end_layout
-
-\begin_layout Enumerate
-sync();
-\end_layout
-
-\begin_layout Enumerate
-overwrite_with_new_data();
-\end_layout
-
-\begin_layout Enumerate
-sync();
-\end_layout
-
-\begin_layout Enumerate
-remove_recovery_header();
-\end_layout
-
-\begin_layout Enumerate
-sync();
-\end_layout
-
-\begin_layout Standard
-On current ext3, each sync flushes all data to disk, so the next 3 syncs
- are relatively inexpensive.
- But this could become a performance bottleneck on other filesystems such
- as ext4.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Neil Brown points out that this is overzealous, and only one sync is needed:
-\end_layout
-
-\begin_layout Enumerate
-Bundle the recovery data, a transaction counter and a strong checksum of
- the new data.
-\end_layout
-
-\begin_layout Enumerate
-Strong checksum that whole bundle.
-\end_layout
-
-\begin_layout Enumerate
-Store the bundle in the database.
-\end_layout
-
-\begin_layout Enumerate
-Overwrite the oldest of the two recovery pointers in the header (identified
- using the transaction counter) with the offset of this bundle.
-\end_layout
-
-\begin_layout Enumerate
-sync.
-\end_layout
-
-\begin_layout Enumerate
-Write the new data to the file.
-\end_layout
-
-\begin_layout Standard
-Checking for recovery means identifying the latest bundle with a valid checksum
- and using the new data checksum to ensure that it has been applied.
- This is more expensive than the current check, but need only be done at
- open.
- For running databases, a separate header field can be used to indicate
- a transaction in progress; we need only check for recovery if this is set.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Deferred.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "sub:TDB-Does-Not"
-
-\end_inset
-
-TDB Does Not Have Snapshot Support
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution: None.
- At some point you say
-\begin_inset Quotes eld
-\end_inset
-
-use a real database
-\begin_inset Quotes erd
-\end_inset
-
- (but see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "replay-attribute"
-
-\end_inset
-
-).
-\end_layout
-
-\begin_layout Standard
-But as a thought experiment, if we implemented transactions to only overwrite
- free entries (this is tricky: there must not be a header in each entry
- which indicates whether it is free; that must instead be indicated by presence in metadata elsewhere),
- and a pointer to the hash table, we could create an entirely new commit
- without destroying existing data.
- Then it would be easy to implement snapshots in a similar way.
-\end_layout
-
-\begin_layout Standard
-This would not allow arbitrary changes to the database, such as tdb_repack
- does, and would require more space (since we have to preserve the current
- and future entries at once).
- If we used hash trees rather than one big hash table, we might only have
- to rewrite some sections of the hash, too.
-\end_layout
-
-\begin_layout Standard
-We could then implement snapshots using a similar method, using multiple
- different hash tables/free tables.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Deferred.
-\end_layout
-
-\begin_layout Subsection
-Transactions Cannot Operate in Parallel
-\end_layout
-
-\begin_layout Standard
-This would be useless for ldb, as it hits the index records with just about
- every update.
- It would add significant complexity in resolving clashes, and cause
- all transaction callers to write their code to loop in the case where the
- transactions spuriously failed.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-None (but see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "replay-attribute"
-
-\end_inset
-
-).
- We could solve a small part of the problem by providing read-only transactions.
- These would allow one write transaction to begin, but it could not commit
- until all r/o transactions are done.
- This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
- commit.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Deferred.
-\end_layout
-
-\begin_layout Subsection
-Default Hash Function Is Suboptimal
-\end_layout
-
-\begin_layout Standard
-The Knuth-inspired multiplicative hash used by tdb is fairly slow (especially
- if we expand it to 64 bits), and works best when the hash bucket size is
- a prime number (which also means a slow modulus).
- In addition, it is highly predictable which could potentially lead to a
- Denial of Service attack in some TDB uses.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The Jenkins lookup3 hash
-\begin_inset Foot
-status open
-
-\begin_layout Plain Layout
-http://burtleburtle.net/bob/c/lookup3.c
-\end_layout
-
-\end_inset
-
- is a fast and superbly-mixing hash.
- It's used by the Linux kernel and almost everything else.
- This has the particular properties that it takes an initial seed, and produces
- two 32 bit hash numbers, which we can combine into a 64-bit hash.
-\end_layout
-
-\begin_layout Standard
-The seed should be created at tdb-creation time from some random source,
- and placed in the header.
- This is far from foolproof, but adds a little bit of protection against
- hash bombing.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "Reliable-Traversal-Adds"
-
-\end_inset
-
-Reliable Traversal Adds Complexity
-\end_layout
-
-\begin_layout Standard
-We lock a record during traversal iteration, and try to grab that lock in
- the delete code.
- If that grab on delete fails, we simply mark it deleted and continue onwards;
- traversal checks for this condition and does the delete when it moves off
- the record.
-\end_layout
-
-\begin_layout Standard
-If traversal terminates, the dead record may be left indefinitely.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Remove reliability guarantees; see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "traverse-Proposed-Solution"
-
-\end_inset
-
-.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-Fcntl Locking Adds Overhead
-\end_layout
-
-\begin_layout Standard
-Placing a fcntl lock means a system call, as does removing one.
- This is actually one reason why transactions can be faster (everything
- is locked once at transaction start).
- In the uncontended case, this overhead can theoretically be eliminated.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-None.
-\end_layout
-
-\begin_layout Standard
-We tried this before with spinlock support, in the early days of TDB, and
- it didn't make much difference except in manufactured benchmarks.
-\end_layout
-
-\begin_layout Standard
-We could use spinlocks (with futex kernel support under Linux), but it means
- that we lose automatic cleanup when a process dies with a lock.
- There is a method of auto-cleanup under Linux, but it's not supported by
- other operating systems.
- We could reintroduce a clear-if-first-style lock and sweep for dead futexes
- on open, but that wouldn't help the normal case of one concurrent opener
- dying.
- Increasingly elaborate repair schemes could be considered, but they require
- an ABI change (everyone must use them) anyway, so there's no need to do
- this at the same time as everything else.
-\end_layout
-
-\begin_layout Subsection
-Some Transactions Don't Require Durability
-\end_layout
-
-\begin_layout Standard
-Volker points out that gencache uses a CLEAR_IF_FIRST tdb for normal (fast)
- usage, and occasionally empties the results into a transactional TDB.
- This kind of usage prioritizes performance over durability: as long as
- we are consistent, data can be lost.
-\end_layout
-
-\begin_layout Standard
-This would be more neatly implemented inside tdb: a
-\begin_inset Quotes eld
-\end_inset
-
-soft
-\begin_inset Quotes erd
-\end_inset
-
- transaction commit (ie.
- syncless) which meant that data may be reverted on a crash.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-None.
-\end_layout
-
-\begin_layout Standard
-Unfortunately any transaction scheme which overwrites old data requires
- a sync before that overwrite to avoid the possibility of corruption.
-\end_layout
-
-\begin_layout Standard
-It seems possible to use a scheme similar to that described in
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:TDB-Does-Not"
-
-\end_inset
-
-, where transactions are committed without overwriting existing data, and
- an array of top-level pointers were available in the header.
- If the transaction is
-\begin_inset Quotes eld
-\end_inset
-
-soft
-\begin_inset Quotes erd
-\end_inset
-
- then we would not need a sync at all: existing processes would pick up
- the new hash table and free list and work with that.
-\end_layout
-
-\begin_layout Standard
-At some later point, a sync would allow recovery of the old data into the
- free lists (perhaps when the array of top-level pointers filled).
- On crash, tdb_open() would examine the array of top levels, and apply the
- transactions until it encountered an invalid checksum.
-\end_layout
-
-\begin_layout Subsection
-Tracing Is Fragile, Replay Is External
-\end_layout
-
-\begin_layout Standard
-The current TDB has compile-time-enabled tracing code, but it often breaks
- as it is not enabled by default.
- In a similar way, the ctdb code has an external wrapper which does replay
- tracing so it can coordinate cluster-wide transactions.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\begin_inset CommandInset label
-LatexCommand label
-name "replay-attribute"
-
-\end_inset
-
-
-\end_layout
-
-\begin_layout Standard
-Tridge points out that an attribute can be later added to tdb_open (see
-
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "attributes"
-
-\end_inset
-
-) to provide replay/trace hooks, which could become the basis for this and
- future parallel transactions and snapshot support.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Deferred.
-\end_layout
-
-\end_body
-\end_document
-@
-
-
-1.12
-log
-@Add status, some fixes, linked freelists.
-@
-text
-@d53 1
-a53 7
-
-\change_deleted 0 1291204535
-14-September
-\change_inserted 0 1291204533
-1-December
-\change_unchanged
--2010
-a580 2
-\change_inserted 0 1291204563
-
-a583 2
-
-\change_inserted 0 1291204572
-a587 2
-
-\change_inserted 0 1291204573
-a588 2
-\change_unchanged
-
-a629 2
-\change_inserted 0 1291204588
-
-a632 2
-
-\change_inserted 0 1291204588
-a636 2
-
-\change_inserted 0 1291204631
-a639 2
-\change_unchanged
-
-a693 2
-\change_inserted 0 1291204639
-
-a696 2
-
-\change_inserted 0 1291204640
-d702 1
-a702 1
-\change_inserted 0 1291204665
-d704 2
-a728 2
-\change_inserted 0 1291204671
-
-a731 2
-
-\change_inserted 0 1291204671
-a735 2
-
-\change_inserted 0 1291204673
-a736 2
-\change_unchanged
-
-a780 2
-\change_inserted 0 1291204731
-
-a783 2
-
-\change_inserted 0 1291204732
-a787 2
-
-\change_inserted 0 1291204779
-a790 2
-\change_unchanged
-
-a842 2
-\change_inserted 0 1291204830
-
-a845 2
-
-\change_inserted 0 1291204831
-a849 2
-
-\change_inserted 0 1291204834
-a850 2
-\change_unchanged
-
-d879 9
-a887 2
- deal of churn; we are better to guarantee that the tdb_errcode is per-thread
- so the current programming model can be maintained.
-d891 9
-d903 2
-a922 2
-\change_inserted 0 1291204847
-
-a925 2
-
-\change_inserted 0 1291204847
-d930 5
-a934 3
-
-\change_inserted 0 1291204852
-Incomplete.
-a1051 2
-\change_inserted 0 1291204881
-
-a1054 2
-
-\change_inserted 0 1291204881
-a1058 2
-
-\change_inserted 0 1291204885
-a1059 2
-\change_unchanged
-
-a1140 2
-\change_inserted 0 1291204898
-
-a1143 2
-
-\change_inserted 0 1291204898
-a1147 2
-
-\change_inserted 0 1291204901
-a1148 2
-\change_unchanged
-
-a1224 2
-\change_inserted 0 1291204908
-
-a1227 2
-
-\change_inserted 0 1291204908
-a1231 2
-
-\change_inserted 0 1291204908
-a1232 2
-\change_unchanged
-
-a1271 2
-\change_inserted 0 1291204917
-
-a1274 2
-
-\change_inserted 0 1291204917
-a1278 2
-
-\change_inserted 0 1291204920
-a1279 2
-\change_unchanged
-
-a1316 2
-\change_inserted 0 1291204927
-
-a1319 2
-
-\change_inserted 0 1291204928
-d1325 1
-a1325 1
-\change_inserted 0 1291204942
-d1327 2
-a1381 2
-\change_inserted 0 1291205003
-
-a1384 2
-
-\change_inserted 0 1291205004
-a1388 2
-
-\change_inserted 0 1291205007
-a1411 2
-\change_inserted 0 1291205019
-
-a1414 2
-
-\change_inserted 0 1291205019
-a1418 2
-
-\change_inserted 0 1291205023
-a1419 2
-\change_unchanged
-
-a1465 2
-\change_inserted 0 1291205029
-
-a1468 2
-
-\change_inserted 0 1291205029
-a1472 2
-
-\change_inserted 0 1291206020
-a1473 2
-\change_unchanged
-
-a1528 2
-\change_inserted 0 1291205043
-
-a1531 2
-
-\change_inserted 0 1291205043
-d1537 1
-a1537 1
-\change_inserted 0 1291205057
-d1539 2
-a1589 2
-\change_inserted 0 1291205062
-
-a1592 2
-
-\change_inserted 0 1291205062
-a1596 2
-
-\change_inserted 0 1291205062
-a1597 2
-\change_unchanged
-
-a1626 2
-\change_inserted 0 1291205072
-
-a1629 2
-
-\change_inserted 0 1291205073
-a1633 2
-
-\change_inserted 0 1291205073
-a1634 2
-\change_unchanged
-
-a1674 4
-
-\change_deleted 0 1291204504
-
-\change_unchanged
-a1699 2
-\change_inserted 0 1291205079
-
-a1702 2
-
-\change_inserted 0 1291205080
-a1706 2
-
-\change_inserted 0 1291205080
-a1707 2
-\change_unchanged
-
-a1833 2
-\change_inserted 0 1291205090
-
-d1869 2
-a1870 7
- is to divide the file into zones, and using a free list (or
-\change_inserted 0 1291205498
-table
-\change_deleted 0 1291205497
-set
-\change_unchanged
- of free lists) for each.
-a1871 2
-\change_inserted 0 1291205203
-
-a1874 2
-
-\change_inserted 0 1291205358
-a1890 21
-\change_unchanged
-
-\end_layout
-
-\begin_layout Standard
-
-\change_deleted 0 1291205198
-Note that this means we need to split the free lists when we expand the
- file; this is probably acceptable when we double the hash table size, since
- that is such an expensive operation already.
- In the case of increasing the file size, there is an optimization we can
- use: if we use M in the formula above as the file size rounded up to the
- next power of 2, we only need reshuffle free lists when the file size crosses
- a power of 2 boundary,
-\emph on
-and
-\emph default
-reshuffling the free lists is trivial: we simply merge every consecutive
- pair of free lists.
-\change_unchanged
-
-d1899 1
-a1899 7
-Identify the correct
-\change_inserted 0 1291205366
-free list
-\change_deleted 0 1291205364
-zone
-\change_unchanged
-.
-d1907 2
-a1908 7
-Re-check the
-\change_inserted 0 1291205372
-list
-\change_deleted 0 1291205371
-zone
-\change_unchanged
- (we didn't have a lock, sizes could have changed): relock if necessary.
-d1912 1
-a1912 5
-Place the freed entry in the list
-\change_deleted 0 1291205382
- for that zone
-\change_unchanged
-.
-d1921 1
-a1921 15
-Pick a
-\change_deleted 0 1291205403
-zone either the zone we last freed into, or based on a
-\begin_inset Quotes eld
-\end_inset
-
-random
-\begin_inset Quotes erd
-\end_inset
-
- number.
-\change_inserted 0 1291205411
-free table; usually the previous one.
-\change_unchanged
-
-a1925 10
-\change_deleted 0 1291205432
-
-\end_layout
-
-\begin_layout Enumerate
-
-\change_deleted 0 1291205428
-Re-check the zone: relock if necessary.
-\change_unchanged
-
-d1934 1
-a1934 7
- unlock the list and try the next
-\change_inserted 0 1291205455
-largest list
-\change_deleted 0 1291205452
-zone.
-\change_inserted 0 1291205457
-
-a1937 2
-
-\change_inserted 0 1291205476
-a1938 2
-\change_unchanged
-
-a1966 2
-\change_inserted 0 1291205542
-
-a1969 2
-
-\change_inserted 0 1291205591
-a1971 70
-\change_unchanged
-
-\end_layout
-
-\begin_layout Standard
-
-\change_deleted 0 1291205539
-I anticipate that the number of entries in each free zone would be small,
- but it might be worth using one free entry to hold pointers to the others
- for cache efficiency.
-\change_unchanged
-
-\end_layout
-
-\begin_layout Standard
-
-\change_deleted 0 1291205534
-\begin_inset CommandInset label
-LatexCommand label
-name "freelist-in-zone"
-
-\end_inset
-
-If we want to avoid locking complexity (enlarging the free lists when we
- enlarge the file) we could place the array of free lists at the beginning
- of each zone.
- This means existing array lists never move, but means that a record cannot
- be larger than a zone.
- That in turn implies that zones should be variable sized (say, power of
- 2), which makes the question
-\begin_inset Quotes eld
-\end_inset
-
-what zone is this record in?
-\begin_inset Quotes erd
-\end_inset
-
- much harder (and
-\begin_inset Quotes eld
-\end_inset
-
-pick a random zone
-\begin_inset Quotes erd
-\end_inset
-
-, but that's less common).
- It could be done with as few as 4 bits from the record header.
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-Using
-\begin_inset Formula $2^{16+N*3}$
-\end_inset
-
-means 0 gives a minimal 65536-byte zone, 15 gives the maximal
-\begin_inset Formula $2^{61}$
-\end_inset
-
- byte zone.
- Zones range in factor of 8 steps.
- Given the zone size for the zone the current record is in, we can determine
- the start of the zone.
-\end_layout
-
-\end_inset
-
-
-\change_inserted 0 1291205139
-
-d2218 1
-a2218 5
-        uint32_t
-\change_inserted 0 1291205758
-used_
-\change_unchanged
-magic : 16,
-a2222 4
-\change_deleted 0 1291205693
-                 prev_is_free: 1,
-\change_unchanged
-
-d2230 1
-a2230 7
-                 top_hash: 1
-\change_inserted 0 1291205704
-1
-\change_deleted 0 1291205704
-0
-\change_unchanged
-;
-d2254 1
-a2254 9
-        uint
-\change_inserted 0 1291205725
-64
-\change_deleted 0 1291205723
-32
-\change_unchanged
-_t
-\change_inserted 0 1291205753
-free_magic: 8,
-a2257 2
-
-\change_inserted 0 1291205746
-a2262 24
-\change_deleted 0 1291205749
-free_magic;
-\change_unchanged
-
-\end_layout
-
-\begin_layout LyX-Code
-        uint64_t
-\change_inserted 0 1291205786
-free_table: 8,
-\end_layout
-
-\begin_layout LyX-Code
-
-\change_inserted 0 1291205788
-
-\change_unchanged
-total_length
-\change_inserted 0 1291205792
- : 56
-\change_deleted 0 1291205790
-;
-\change_unchanged
-
-d2266 1
-a2266 7
-        uint64_t
-\change_deleted 0 1291205801
-prev,
-\change_unchanged
-next;
-\change_deleted 0 1291205811
-
-d2270 1
-a2270 3
-
-\change_deleted 0 1291205811
-        ...
-d2274 1
-a2274 5
-
-\change_deleted 0 1291205808
-        uint64_t tailer
-\change_unchanged
-;
-d2283 5
-a2287 16
-\change_deleted 0 1291205827
-We might want to take some bits from the used record's top_hash (and the
- free record which has 32 bits of padding to spare anyway) if we use variable
- sized zones.
- See
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "freelist-in-zone"
-
-\end_inset
-
-.
-
-\change_inserted 0 1291205885
- Note that by limiting valid offsets to 56 bits, we can pack everything
- we need into 3 64-byte words, meaning our minimum record size is 8 bytes.
-a2290 2
-
-\change_inserted 0 1291205886
-a2294 2
-
-\change_inserted 0 1291205886
-a2295 2
-\change_unchanged
-
-a2385 2
-\change_inserted 0 1291205894
-
-a2388 2
-
-\change_inserted 0 1291205894
-a2392 2
-
-\change_inserted 0 1291205902
-a2393 2
-\change_unchanged
-
-a2415 4
-
-\change_deleted 0 1291204504
-
-\change_unchanged
-a2445 2
-\change_inserted 0 1291205910
-
-a2448 2
-
-\change_inserted 0 1291205910
-a2452 2
-
-\change_inserted 0 1291205914
-a2453 2
-\change_unchanged
-
-a2485 2
-\change_inserted 0 1291205919
-
-a2488 2
-
-\change_inserted 0 1291205919
-a2492 2
-
-\change_inserted 0 1291205922
-a2493 2
-\change_unchanged
-
-a2533 2
-\change_inserted 0 1291205929
-
-a2536 2
-
-\change_inserted 0 1291205929
-a2540 2
-
-\change_inserted 0 1291205929
-a2541 2
-\change_unchanged
-
-a2578 2
-\change_inserted 0 1291205932
-
-a2581 2
-
-\change_inserted 0 1291205933
-a2585 2
-
-\change_inserted 0 1291205933
-a2586 2
-\change_unchanged
-
-a2724 2
-\change_inserted 0 1291205944
-
-a2727 2
-
-\change_inserted 0 1291205945
-a2731 2
-
-\change_inserted 0 1291205948
-a2732 2
-\change_unchanged
-
-@
-
-
-1.11
-log
-@Merge changes
-@
-text
-@d53 7
-a59 1
-14-September-2010
-d587 16
-d644 18
-d716 16
-d753 16
-d813 18
-d883 16
-d953 16
-d1084 16
-d1181 16
-d1273 16
-d1328 16
-d1381 16
-d1447 19
-a1465 2
- if older code (which doesn't understand the feature) writes to the database.Reco
-rd Headers Are Not Expandible
-d1484 16
-d1546 16
-d1617 16
-d1680 16
-d1725 16
-d1810 16
-d1951 8
-a1958 3
-Proposed SolutionThe first step is to remove all the current heuristics,
- as they obviously interact, then examine them once the lock contention
- is addressed.
-d1989 7
-a1995 2
- is to divide the file into zones, and using a free list (or set of free
- lists) for each.
-d1997 2
-d2002 25
-d2039 2
-d2049 7
-a2055 1
-Identify the correct zone.
-d2063 7
-a2069 2
-Re-check the zone (we didn't have a lock, sizes could have changed): relock
- if necessary.
-d2073 5
-a2077 1
-Place the freed entry in the list for that zone.
-d2086 3
-a2088 1
-Pick a zone either the zone we last freed into, or based on a
-d2097 4
-d2105 2
-d2110 2
-d2113 2
-d2123 15
-a2137 1
- unlock the list and try the next zone.
-d2166 11
-d2180 2
-d2185 2
-d2190 2
-d2223 1
-a2223 1
-status open
-d2243 2
-d2491 5
-a2495 1
-        uint32_t magic : 16,
-d2499 2
-d2502 2
-d2511 7
-a2517 1
-                 top_hash: 10;
-d2541 29
-a2569 1
-        uint32_t free_magic;
-d2573 11
-a2583 1
-        uint64_t total_length;
-d2587 7
-a2593 1
-        uint64_t prev, next;
-d2597 2
-d2603 5
-a2607 1
-        uint64_t tailer;
-d2615 2
-d2628 18
-d2736 16
-d2808 16
-d2856 16
-d2912 16
-d2965 16
-d3119 16
-@
-
-
-1.10
-log
-@Tracing attribute, talloc support.
-@
-text
-@d1 1
-a1 1
-#LyX 1.6.5 created this file. For more info see http://www.lyx.org/
-d53 1
-a53 7
-
-\change_deleted 0 1283307542
-26-July
-\change_inserted 0 1284423485
-14-September
-\change_unchanged
--2010
-a472 2
-\change_inserted 0 1284422789
-
-a479 2
-\change_unchanged
-
-a838 2
-
-\change_inserted 0 1284016998
-a846 2
-\change_unchanged
-
-a1194 2
-\change_inserted 0 1284015637
-
-a1197 2
-
-\change_inserted 0 1284015716
-a1201 2
-
-\change_inserted 0 1284015906
-a1210 2
-
-\change_inserted 0 1284015637
-a1214 2
-
-\change_inserted 0 1284016114
-a1227 2
-
-\change_inserted 0 1284016149
-a1232 2
-
-\change_inserted 0 1284016639
-a1237 2
-
-\change_inserted 0 1284016821
-a1243 2
-
-\change_inserted 0 1284016803
-d1245 2
-a1246 9
- if older code (which doesn't understand the feature) writes to the database.
-\change_deleted 0 1284016101
-
-\end_layout
-
-\begin_layout Subsection
-
-\change_inserted 0 1284015634
-Record Headers Are Not Expandible
-a1249 2
-
-\change_inserted 0 1284015634
-a1254 2
-
-\change_inserted 0 1284015634
-a1258 2
-
-\change_inserted 0 1284422552
-a1267 2
-
-\change_inserted 0 1284422568
-a1271 2
-
-\change_inserted 0 1284422646
-a1276 2
-
-\change_inserted 0 1284422656
-a1280 2
-
-\change_inserted 0 1284423065
-a1305 2
-
-\change_inserted 0 1284423042
-a1310 2
-\change_unchanged
-
-a1457 2
-
-\change_inserted 0 1283336713
-a1463 2
-
-\change_unchanged
-d1482 2
-d1485 1
-a1485 51
-\change_deleted 0 1283307675
-There are three details which become important:
-\end_layout
-
-\begin_layout Enumerate
-
-\change_deleted 0 1283307675
-On encountering a full bucket, we use the next bucket.
-\end_layout
-
-\begin_layout Enumerate
-
-\change_deleted 0 1283307675
-Extra hash bits are stored with the offset, to reduce comparisons.
-\end_layout
-
-\begin_layout Enumerate
-
-\change_deleted 0 1283307675
-A marker entry is used on deleting an entry.
-\end_layout
-
-\begin_layout Standard
-
-\change_deleted 0 1283307675
-The doubling of the table must be done under a transaction; we will not
- reduce it on deletion, so it will be an unusual case.
- It will either be placed at the head (other entries will be moved out the
- way so we can expand).
- We could have a pointer in the header to the current hashtable location,
- but that pointer would have to be read frequently to check for hashtable
- moves.
-\end_layout
-
-\begin_layout Standard
-
-\change_deleted 0 1283307675
-The locking for this is slightly more complex than the chained case; we
- currently have one lock per bucket, and that means we would need to expand
- the lock if we overflow to the next bucket.
- The frequency of such collisions will effect our locking heuristics: we
- can always lock more buckets than we need.
-\end_layout
-
-\begin_layout Standard
-
-\change_deleted 0 1283307675
-One possible optimization is to only re-check the hash size on an insert
- or a lookup miss.
-
-\change_inserted 0 1283307770
-a1492 2
-
-\change_inserted 0 1283336187
-a1500 2
-
-\change_inserted 0 1283336586
-a1510 2
-\change_unchanged
-
-d1636 3
-a1638 8
-Proposed Solution
-\change_deleted 0 1283336858
-
-\end_layout
-
-\begin_layout Standard
-The first step is to remove all the current heuristics, as they obviously
- interact, then examine them once the lock contention is addressed.
-a1647 2
-\change_inserted 0 1283336910
-
-a1650 2
-
-\change_inserted 0 1283337052
-a1655 2
-\change_unchanged
-
-a1776 2
-\change_inserted 0 1283309850
-
-a1779 2
-
-\change_inserted 0 1283337216
-a1813 2
-
-\change_inserted 0 1284424151
-a1825 2
-\change_unchanged
-
-a1830 2
-\change_unchanged
-
-a2031 2
-
-\change_inserted 0 1283336739
-a2040 2
-\change_unchanged
-
-a2117 2
-\change_inserted 0 1283337133
-
-a2120 2
-
-\change_inserted 0 1283337139
-a2121 2
-\change_unchanged
-
-a2136 2
-
-\change_inserted 0 1283337235
-a2147 2
-\change_unchanged
-
-d2251 1
-a2251 7
-Proposed Solution
-\change_deleted 0 1284423472
-
-\end_layout
-
-\begin_layout Standard
-None.
-d2261 1
-a2261 1
-\change_inserted 0 1284423891
-d2263 1
-a2263 4
-\change_deleted 0 1284423891
-.
-
-\change_inserted 0 1284423901
-a2271 2
-\change_unchanged
-
-a2293 2
-\change_inserted 0 1284423495
-
-a2312 2
-
-\change_inserted 0 1284424201
-d2321 1
-a2321 3
-
-\change_unchanged
-We could solve a small part of the problem by providing read-only transactions.
-a2505 2
-\change_inserted 0 1284423555
-
-a2508 2
-
-\change_inserted 0 1284423617
-a2512 2
-
-\change_inserted 0 1284423719
-a2519 2
-
-\change_inserted 0 1284423864
-a2530 2
-
-\change_inserted 0 1284423850
-a2540 2
-\change_unchanged
-
-@
-
-
-1.9
-log
-@Extension mechanism.
-@
-text
-@d56 2
-a57 2
-\change_inserted 0 1284016854
-9-September
-d479 11
-d1303 1
-a1303 1
-\change_inserted 0 1284016847
-d1310 56
-d1945 1
-a1945 1
-\change_inserted 0 1283310945
-d1956 2
-d2402 2
-d2416 4
-d2421 12
-d2455 2
-d2476 12
-d2673 47
-@
-
-
-1.8
-log
-@Remove bogus footnote
-@
-text
-@d56 2
-a57 2
-\change_inserted 0 1283307544
-1-September
-d838 12
-d1198 103
-@
-
-
-1.7
-log
-@Moving hash table does not work.
-@
-text
-@a1436 12
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-
-\change_inserted 0 1283336450
-If we make the hash offsets zone-relative, then this only restricts the
- zone size, not the overall database size.
-\end_layout
-
-\end_inset
-
-@
-
-
-1.6
-log
-@Commit changes
-@
-text
-@d38 1
-a38 1
-\author ""
-d53 7
-a59 1
-26-July-2010
-d1333 10
-d1361 3
-a1363 1
- There are three details which become important:
-d1367 2
-d1373 2
-d1379 2
-d1385 2
-d1397 2
-d1407 2
-d1411 45
-d1582 2
-d1598 14
-d1733 62
-d1996 13
-d2086 10
-d2110 15
-a2124 1
-\begin_layout LyX-Code
-@
-
-
-1.5
-log
-@Soft transaction commit
-@
-text
-@d38 1
-a38 1
-\author "Rusty Russell,,,"
-a52 4
-
-\change_deleted 0 1280141199
-10-May-2010
-\change_inserted 0 1280141202
-a53 2
-\change_unchanged
-
-a2028 2
-
-\change_inserted 0 1280140902
-a2034 2
-
-\change_unchanged
-a2212 2
-\change_inserted 0 1280140661
-
-a2215 2
-
-\change_inserted 0 1280140703
-a2219 2
-
-\change_inserted 0 1280708312
-a2226 2
-
-\change_inserted 0 1280708400
-a2239 2
-
-\change_inserted 0 1280140836
-a2243 2
-
-\change_inserted 0 1280708255
-a2247 2
-
-\change_inserted 0 1280708374
-a2252 2
-
-\change_inserted 0 1280141181
-a2274 2
-
-\change_inserted 0 1280141345
-@
-
-
-1.4
-log
-@Merge changes
-@
-text
-@d38 1
-a38 1
-\author ""
-d53 2
-d56 4
-d2035 10
-d2223 84
-@
-
-
-1.3
-log
-@Transaction and freelist rethink.
-@
-text
-@d38 1
-a38 1
-\author "Rusty Russell,,,"
-d53 1
-a53 1
-27-April-2010
-d662 1
-a662 5
- behavior of disallowing
-\change_inserted 0 1272940179
-nested
-\change_unchanged
-transactions should become the default.
-a1210 2
-\change_inserted 0 1272944650
-
-a1214 2
-
-\change_inserted 0 1272944763
-a1218 2
-\change_unchanged
-
-a1223 2
-\change_unchanged
-
-a1301 2
-
-\change_inserted 0 1273478114
-a1310 2
-\change_unchanged
-
-d1515 1
-a1515 11
-The free list
-\change_deleted 0 1273469807
-should
-\change_inserted 0 1273469810
-must
-\change_unchanged
- be split
-\change_deleted 0 1273469815
-into multiple lists
-\change_unchanged
-to reduce contention.
-a1520 2
-\change_inserted 0 1273470006
-
-a1523 2
-
-\change_inserted 0 1273492055
-a1539 2
-
-\change_inserted 0 1273483888
-a1551 2
-\change_unchanged
-
-a1554 8
-
-\change_deleted 0 1272942055
-There are various ways to organize these lisys, but because we want to be
- able to quickly identify which free list an entry is in, and reduce the
- number of locks required for merging, we will use zoning (eg.
- each free list covers some fixed fraction of the file).
-
-\change_inserted 0 1273484187
-d1556 1
-a1556 7
-
-\change_deleted 0 1273484194
-The algorithm for f
-\change_inserted 0 1273484194
-F
-\change_unchanged
-reeing is simple:
-d1560 1
-a1560 7
-Identify the correct
-\change_deleted 0 1273482856
-free list
-\change_inserted 0 1273482857
-zone
-\change_unchanged
-.
-d1564 1
-a1564 7
-Lock the
-\change_inserted 0 1273482895
-corresponding
-\change_unchanged
-list
-\change_inserted 0 1273482863
-.
-a1567 2
-
-\change_inserted 0 1273482909
-d1573 1
-a1573 13
-
-\change_deleted 0 1273482885
-, and p
-\change_inserted 0 1273482888
-P
-\change_unchanged
-lace the freed entry
-\change_deleted 0 1273492415
-at the head
-\change_inserted 0 1273492415
-in the list for that zone
-\change_unchanged
-.
-d1577 2
-a1578 7
-Allocation is a little more complicated, as we
-\change_deleted 0 1273483240
-merge entries as we walk the list:
-\change_inserted 0 1273484250
-perform delayed coalescing at this point:
-\change_unchanged
-
-d1582 1
-a1582 19
-Pick a
-\change_deleted 0 1273482955
-free list;
-\change_inserted 0 1273482957
-zone
-\change_unchanged
- either the
-\change_deleted 0 1273482962
-list
-\change_inserted 0 1273482962
-zone
-\change_unchanged
- we last freed
-\change_deleted 0 1273482966
-o
-\change_inserted 0 1273482966
-i
-\change_unchanged
-nto, or based on a
-d1594 1
-a1594 9
-Lock th
-\change_inserted 0 1273482980
-e corresponding
-\change_deleted 0 1273482973
-at
-\change_unchanged
- list.
-\change_inserted 0 1273482982
-
-a1597 2
-
-\change_inserted 0 1273483084
-a1598 53
-\change_unchanged
-
-\end_layout
-
-\begin_layout Enumerate
-If the top entry is
-\change_deleted 0 1273492155
-well-sized,
-\change_inserted 0 1273492159
--large enough,
-\change_unchanged
-remove it from the list and return it.
-\end_layout
-
-\begin_layout Enumerate
-Otherwise,
-\change_inserted 0 1273492206
-coalesce entries in the list.
-\change_deleted 0 1273492200
-examine the entry to the right of it in the file.
- If it is free:
-\end_layout
-
-\begin_deeper
-\begin_layout Enumerate
-
-\change_deleted 0 1273492200
-If that entry is in a different list, lock that list too.
-\end_layout
-
-\begin_layout Enumerate
-
-\change_deleted 0 1273492200
-If we had to place a new lock, re-check that the entry is free.
-\end_layout
-
-\begin_layout Enumerate
-
-\change_deleted 0 1273492200
-Remove that entry from its free list and expand this entry to cover it.
-\end_layout
-
-\begin_layout Enumerate
-
-\change_deleted 0 1273485554
-Goto step 3.
-\end_layout
-
-\end_deeper
-\begin_layout Enumerate
-
-\change_inserted 0 1273485311
-If there was no entry large enough, unlock the list and try the next zone.
-d1602 1
-a1602 5
-
-\change_deleted 0 1273483646
-Repeat step 3 with each entry in the list.
-\change_unchanged
-
-d1606 2
-a1607 5
-
-\change_deleted 0 1273483668
-Unlock the list and repeat step 2 with the next list.
-\change_unchanged
-
-d1611 1
-a1611 7
-If no
-\change_deleted 0 1273483671
-list
-\change_inserted 0 1273483671
-zone
-\change_unchanged
- satisfies, expand the file.
-d1615 2
-a1616 9
-This optimizes rapid insert/delete of free list entries
-\change_inserted 0 1273485794
- by not coalescing them all the time.
-\change_deleted 0 1273483685
-, and allows us to get rid of the tailer altogether
-\change_unchanged
-.
-
-\change_inserted 0 1273492299
-a1638 39
-
-\change_deleted 0 1273476840
-The question of
-\begin_inset Quotes eld
-\end_inset
-
-well-sized
-\begin_inset Quotes erd
-\end_inset
-
- free entries is more difficult: the 25% overhead works in practice for
- ldb because indexes tend to expand by one record at a time.
- This can be resolved by having an
-\begin_inset Quotes eld
-\end_inset
-
-expanded
-\begin_inset Quotes erd
-\end_inset
-
- bit in the header to note entries that have previously expanded, and allocating
- more space for them.
- Whether the
-\begin_inset Quotes eld
-\end_inset
-
-increasing slack
-\begin_inset Quotes erd
-\end_inset
-
- algorithm should be implemented or first-fit used is still unknown: we
- will determine this once these other ideas are implemented.
-\change_inserted 0 1273483750
-
-\end_layout
-
-\begin_layout Standard
-
-\change_inserted 0 1273492450
-a1644 2
-
-\change_inserted 0 1273470441
-a1654 2
-
-\change_inserted 0 1273476556
-a1659 2
-
-\change_inserted 0 1273470423
-a1661 2
-\change_unchanged
-
-a1672 2
-
-\change_inserted 0 1273476847
-a1676 2
-
-\change_inserted 0 1273476886
-a1691 2
-
-\change_inserted 0 1273477233
-a1699 2
-
-\change_inserted 0 1273477534
-a1706 2
-
-\change_inserted 0 1273482700
-a1712 2
-
-\change_inserted 0 1273478079
-a1722 2
-
-\change_inserted 0 1273477839
-a1726 2
-
-\change_inserted 0 1273477925
-a1730 2
-
-\change_inserted 0 1273477925
-a1734 2
-
-\change_inserted 0 1273477925
-a1738 2
-
-\change_inserted 0 1273477925
-a1742 2
-
-\change_inserted 0 1273477925
-a1746 2
-
-\change_inserted 0 1273477925
-a1750 2
-
-\change_inserted 0 1273477925
-a1754 2
-
-\change_inserted 0 1273477925
-a1758 2
-
-\change_inserted 0 1273477925
-a1762 2
-
-\change_inserted 0 1273477925
-a1766 2
-
-\change_inserted 0 1273477925
-a1770 2
-
-\change_inserted 0 1273477925
-a1774 2
-
-\change_inserted 0 1273477925
-a1778 2
-
-\change_inserted 0 1273477925
-a1782 2
-
-\change_inserted 0 1273477925
-a1786 2
-
-\change_inserted 0 1273477925
-a1790 2
-
-\change_inserted 0 1273477925
-a1794 2
-
-\change_inserted 0 1273477925
-a1798 2
-
-\change_inserted 0 1273492522
-a1802 2
-
-\change_inserted 0 1273492530
-a1806 2
-
-\change_inserted 0 1273492546
-a1810 2
-
-\change_inserted 0 1273478239
-a1814 2
-
-\change_inserted 0 1273479960
-a1821 2
-
-\change_inserted 0 1273480265
-a1830 2
-
-\change_inserted 0 1273480354
-a1845 2
-
-\change_inserted 0 1273478968
-a1851 2
-
-\change_inserted 0 1273492604
-a1859 2
-
-\change_inserted 0 1273479572
-a1862 2
-\change_unchanged
-
-a1870 2
-
-\change_inserted 0 1273480282
-a1874 2
-
-\change_inserted 0 1273478931
-a1878 2
-
-\change_inserted 0 1273481549
-a1882 2
-
-\change_inserted 0 1273481557
-a1886 2
-
-\change_inserted 0 1273480307
-a1890 2
-
-\change_inserted 0 1273480335
-a1894 2
-
-\change_inserted 0 1273479897
-a1898 2
-
-\change_inserted 0 1273479653
-a1902 2
-
-\change_inserted 0 1273480371
-a1906 2
-
-\change_inserted 0 1273480464
-a1910 2
-
-\change_inserted 0 1273480399
-a1914 2
-
-\change_inserted 0 1273480425
-a1918 2
-
-\change_inserted 0 1273480453
-a1922 2
-
-\change_inserted 0 1273480455
-a1926 2
-
-\change_inserted 0 1273480450
-a1930 2
-
-\change_inserted 0 1273480452
-a1935 2
-\change_inserted 0 1273478830
-
-a1942 5
-
-\change_deleted 0 1273481604
-In theory, we could get away with 2: one after we write the new data, and
- one to somehow atomically change over to it.
-\change_inserted 0 1273481632
-a1946 2
-
-\change_inserted 0 1273481724
-a1950 2
-
-\change_inserted 0 1273481713
-a1954 2
-
-\change_inserted 0 1273481717
-a1958 2
-
-\change_inserted 0 1273481730
-a1962 2
-
-\change_inserted 0 1273481736
-a1966 2
-
-\change_inserted 0 1273481744
-a1970 2
-
-\change_inserted 0 1273481748
-a1974 2
-
-\change_inserted 0 1273482185
-a1978 2
-
-\change_inserted 0 1273482259
-a1989 50
-
-\change_deleted 0 1273481848
-None.
- Trying to rewrite the transaction code is a separate experiment, which
- I encourage someone else to do.
- At some point you say
-\begin_inset Quotes eld
-\end_inset
-
-use a real database
-\begin_inset Quotes erd
-\end_inset
-
-.
-\end_layout
-
-\begin_layout Standard
-
-\change_deleted 0 1273481848
-But as a thought experiment:
-\change_unchanged
-
-\end_layout
-
-\begin_layout Standard
-
-\change_deleted 0 1273481788
-Say there was a pointer in the header which said where the hash table and
- free list tables were, and that no blocks were labeled with whether they
- were free or not (it had to be derived from what list they were in).
- We could create new hash table and free list in some free space, and populate
- it as we want the post-committed state to look.
- Then we sync, then we switch the offset in the header, then we sync again.
-\end_layout
-
-\begin_layout Standard
-
-\change_deleted 0 1273481788
-This would not allow arbitrary changes to the database, such as tdb_repack
- does, and would require more space (since we have to preserve the current
- and future entries at once).
- If we used hash trees rather than one big hash table, we might only have
- to rewrite some sections of the hash, too.
-\change_inserted 0 1273481854
-
-\end_layout
-
-\begin_layout Standard
-
-\change_inserted 0 1273482102
-a1993 2
-
-\change_inserted 0 1273482061
-a1998 2
-
-\change_inserted 0 1273482063
-a2002 2
-
-\change_inserted 0 1273482072
-a2006 2
-
-\change_inserted 0 1273482139
-a2011 2
-
-\change_inserted 0 1273482364
-a2015 2
-
-\change_inserted 0 1273482163
-a2019 2
-
-\change_inserted 0 1273482493
-a2037 2
-
-\change_inserted 0 1273482536
-a2046 2
-\change_unchanged
-
-a2049 2
-
-\change_inserted 0 1273482641
-a2058 2
-
-\change_inserted 0 1273481827
-d2067 2
-a2068 11
-We could
-\change_inserted 0 1273481829
-then
-\change_unchanged
-implement snapshots using a similar method
-\change_deleted 0 1273481838
- to the above, only
-\change_inserted 0 1273481840
-,
-\change_unchanged
- using multiple different hash tables/free tables.
-@
-
-
-1.2
-log
-@After first feedback (Ronnie & Volker)
-@
-text
-@d1314 13
-d1531 11
-a1541 1
-The free list should be split into multiple lists to reduce contention.
-d1547 39
-d1596 7
-d1604 1
-a1604 1
-The algorithm for freeing is simple:
-d1608 7
-a1614 1
-Identify the correct free list.
-d1618 30
-a1647 1
-Lock the list, and place the freed entry at the head.
-d1651 7
-a1657 2
-Allocation is a little more complicated, as we merge entries as we walk
- the list:
-d1661 19
-a1679 1
-Pick a free list; either the list we last freed onto, or based on a
-d1691 17
-a1707 1
-Lock that list.
-d1711 7
-a1717 1
-If the top entry is well-sized, remove it from the list and return it.
-d1721 5
-a1725 1
-Otherwise, examine the entry to the right of it in the file.
-d1731 2
-d1737 2
-d1743 2
-d1749 2
-d1756 8
-d1765 2
-d1770 2
-d1773 2
-d1778 7
-a1784 1
-If no list satisfies, expand the file.
-d1788 28
-a1815 2
-This optimizes rapid insert/delete of free list entries, and allows us to
- get rid of the tailer altogether.
-d1819 2
-d1851 1
-a1851 1
-\change_inserted 0 1272941474
-d1857 303
-a2159 18
-\change_inserted 0 1272942759
-There are various ways to organize these lists, but because we want to be
- able to quickly identify which free list an entry is in, and reduce the
- number of locks required for merging, we will use zoning (eg.
- each of the N free lists in a tdb file of size M covers a fixed fraction
- M/N).
- Note that this means we need to reshuffle the free lists when we expand
- the file; this is probably acceptable when we double the hash table size,
- since that is such an expensive operation already.
- In the case of increasing the file size, there is an optimization we can
- use: if we use M in the formula above as the file size rounded up to the
- next power of 2, we only need reshuffle free lists when the file size crosses
- a power of 2 boundary,
-\emph on
-and
-\emph default
-reshuffling the free lists is trivial: we simply merge every consecutive
- pair of free lists.
-d2164 107
-d2276 2
-d2280 59
-d2346 2
-d2363 2
-d2366 2
-d2371 2
-d2382 2
-d2389 57
-d2458 13
-d2474 32
-a2505 2
-We could implement snapshots using a similar method to the above, only using
- multiple different hash tables/free tables.
-@
-
-
-1.1
-log
-@Initial revision
-@
-text
-@d1 1
-a1 1
-#LyX 1.6.4 created this file. For more info see http://www.lyx.org/
-d36 3
-a38 3
-\tracking_changes false
-\output_changes false
-\author ""
-d662 5
-a666 1
- behavior of disallowing transactions should become the default.
-d1215 21
-d1527 2
-d1533 3
-a1535 1
- The algorithm for freeing is simple:
-d1642 26
-@
diff --git a/lib/tdb2/doc/design.pdf b/lib/tdb2/doc/design.pdf
deleted file mode 100644
index 558dc1f8c2..0000000000
Binary files a/lib/tdb2/doc/design.pdf and /dev/null differ
diff --git a/lib/tdb2/doc/design.txt b/lib/tdb2/doc/design.txt
deleted file mode 100644
index bd2ffde4db..0000000000
--- a/lib/tdb2/doc/design.txt
+++ /dev/null
@@ -1,1258 +0,0 @@
-TDB2: A Redesigning The Trivial DataBase
-
-Rusty Russell, IBM Corporation
-
-1-December-2010
-
-Abstract
-
-The Trivial DataBase on-disk format is 32 bits; with usage cases
-heading towards the 4G limit, that must change. This required
-breakage provides an opportunity to revisit TDB's other design
-decisions and reassess them.
-
-1 Introduction
-
-The Trivial DataBase was originally written by Andrew Tridgell as
-a simple key/data pair storage system with the same API as dbm,
-but allowing multiple readers and writers while being small
-enough (< 1000 lines of C) to include in SAMBA. The simple design
-created in 1999 has proven surprisingly robust and performant,
-used in Samba versions 3 and 4 as well as numerous other
-projects. Its useful life was greatly increased by the
-(backwards-compatible!) addition of transaction support in 2005.
-
-The wider variety and greater demands of TDB-using code have led
-to some organic growth of the API, as well as some compromises on
-the implementation. None of these, by themselves, are seen as
-show-stoppers, but the cumulative effect is a loss of elegance
-over the initial, simple TDB implementation. Here is a table of
-the approximate number of lines of implementation code and number
-of API functions at the end of each year:
-
-
-+-----------+----------------+--------------------------------+
-| Year End  | API Functions  | Lines of C Code Implementation |
-+-----------+----------------+--------------------------------+
-+-----------+----------------+--------------------------------+
-|   1999    |      13        |              1195              |
-+-----------+----------------+--------------------------------+
-|   2000    |      24        |              1725              |
-+-----------+----------------+--------------------------------+
-|   2001    |      32        |              2228              |
-+-----------+----------------+--------------------------------+
-|   2002    |      35        |              2481              |
-+-----------+----------------+--------------------------------+
-|   2003    |      35        |              2552              |
-+-----------+----------------+--------------------------------+
-|   2004    |      40        |              2584              |
-+-----------+----------------+--------------------------------+
-|   2005    |      38        |              2647              |
-+-----------+----------------+--------------------------------+
-|   2006    |      52        |              3754              |
-+-----------+----------------+--------------------------------+
-|   2007    |      66        |              4398              |
-+-----------+----------------+--------------------------------+
-|   2008    |      71        |              4768              |
-+-----------+----------------+--------------------------------+
-|   2009    |      73        |              5715              |
-+-----------+----------------+--------------------------------+
-
-
-This review is an attempt to catalog and address all the known
-issues with TDB and create solutions which address the problems
-without significantly increasing complexity; all involved are far
-too aware of the dangers of second system syndrome in rewriting a
-successful project like this.
-
-2 API Issues
-
-2.1 tdb_open_ex Is Not Expandable
-
-The tdb_open() call was expanded to tdb_open_ex(), which added an
-optional hashing function and an optional logging function
-argument. Additional arguments to open would require the
-introduction of a tdb_open_ex2 call etc.
-
-2.1.1 Proposed Solution<attributes>
-
-tdb_open() will take a linked-list of attributes:
-
-enum tdb_attribute {
-
-    TDB_ATTRIBUTE_LOG = 0,
-
-    TDB_ATTRIBUTE_HASH = 1
-
-};
-
-struct tdb_attribute_base {
-
-    enum tdb_attribute attr;
-
-    union tdb_attribute *next;
-
-};
-
-struct tdb_attribute_log {
-
-    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG
-*/
-
-    tdb_log_func log_fn;
-
-    void *log_private;
-
-};
-
-struct tdb_attribute_hash {
-
-    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH
-*/
-
-    tdb_hash_func hash_fn;
-
-    void *hash_private;
-
-};
-
-union tdb_attribute {
-
-    struct tdb_attribute_base base;
-
-    struct tdb_attribute_log log;
-
-    struct tdb_attribute_hash hash;
-
-};
-
-This allows future attributes to be added, even if this expands
-the size of the union.
-
-2.1.2 Status
-
-Complete.
-
-2.2 tdb_traverse Makes Impossible Guarantees
-
-tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions,
-and it was thought that it was important to guarantee that all
-records which exist at the start and end of the traversal would
-be included, and no record would be included twice.
-
-This adds complexity (see [Reliable-Traversal-Adds]) and does not
-work anyway for records which are altered (in particular, those
-which are expanded may be effectively deleted and re-added behind
-the traversal).
-
-2.2.1 <traverse-Proposed-Solution>Proposed Solution
-
-Abandon the guarantee. You will see every record if no changes
-occur during your traversal, otherwise you will see some subset.
-You can prevent changes by using a transaction or the locking
-API.
-
-2.2.2 Status
-
-Complete. Delete-during-traverse will still delete every record,
-too (assuming no other changes).
-
-2.3 Nesting of Transactions Is Fraught
-
-TDB has alternated between allowing nested transactions and not
-allowing them. Various paths in the Samba codebase assume that
-transactions will nest, and in a sense they can: the operation is
-only committed to disk when the outer transaction is committed.
-There are two problems, however:
-
-1. Canceling the inner transaction will cause the outer
-  transaction commit to fail, and will not undo any operations
-  since the inner transaction began. This problem is soluble with
-  some additional internal code.
-
-2. An inner transaction commit can be cancelled by the outer
-  transaction. This is desirable in the way which Samba's
-  database initialization code uses transactions, but could be a
-  surprise to any users expecting a successful transaction commit
-  to expose changes to others.
-
-The current solution is to specify the behavior at tdb_open(),
-with the default currently that nested transactions are allowed.
-This flag can also be changed at runtime.
-
-2.3.1 Proposed Solution
-
-Given the usage patterns, it seems that the “least-surprise”
-behavior of disallowing nested transactions should become the
-default. Additionally, it seems the outer transaction is the only
-code which knows whether inner transactions should be allowed, so
-a flag to indicate this could be added to tdb_transaction_start.
-However, this behavior can be simulated with a wrapper which uses
-tdb_add_flags() and tdb_remove_flags(), so the API should not be
-expanded for this relatively-obscure case.
-
-2.3.2 Status
-
-Incomplete; nesting flag is still defined as per tdb1.
-
-2.4 Incorrect Hash Function is Not Detected
-
-tdb_open_ex() allows the calling code to specify a different hash
-function to use, but does not check that all other processes
-accessing this tdb are using the same hash function. The result
-is that records are missing from tdb_fetch().
-
-2.4.1 Proposed Solution
-
-The header should contain an example hash result (eg. the hash of
-0xdeadbeef), and tdb_open_ex() should check that the given hash
-function produces the same answer, or fail the tdb_open call.
-
-2.4.2 Status
-
-Complete.
-
-2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
-
-In response to scalability issues with the free list ([TDB-Freelist-Is]
-) two API workarounds have been incorporated in TDB:
-tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open. The
-latter actually calls the former with an argument of “5”.
-
-This code allows deleted records to accumulate without putting
-them in the free list. On delete we iterate through each chain
-and free them in a batch if there are more than max_dead entries.
-These are never otherwise recycled except as a side-effect of a
-tdb_repack.
-
-2.5.1 Proposed Solution
-
-With the scalability problems of the freelist solved, this API
-can be removed. The TDB_VOLATILE flag may still be useful as a
-hint that store and delete of records will be at least as common
-as fetch in order to allow some internal tuning, but initially
-will become a no-op.
-
-2.5.2 Status
-
-Incomplete. TDB_VOLATILE still defined, but implementation should
-fail on unknown flags to be future-proof.
-
-2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times
-  In The Same Process
-
-No process can open the same TDB twice; we check and disallow it.
-This is an unfortunate side-effect of fcntl locks, which operate
-on a per-file rather than per-file-descriptor basis, and do not
-nest. Thus, closing any file descriptor on a file clears all the
-locks obtained by this process, even if they were placed using a
-different file descriptor!
-
-Note that even if this were solved, deadlock could occur if
-operations were nested: this is a more manageable programming
-error in most cases.
-
-2.6.1 Proposed Solution
-
-We could lobby POSIX to fix the perverse rules, or at least lobby
-Linux to violate them so that the most common implementation does
-not have this restriction. This would be a generally good idea
-for other fcntl lock users.
-
-Samba uses a wrapper which hands out the same tdb_context to
-multiple callers if this happens, and does simple reference
-counting. We should do this inside the tdb library, which already
-emulates lock nesting internally; it would need to recognize when
-deadlock occurs within a single process. This would create a new
-failure mode for tdb operations (while we currently handle
-locking failures, they are impossible in normal use and a process
-encountering them can do little but give up).
-
-I do not see benefit in an additional tdb_open flag to indicate
-whether re-opening is allowed, although there may be some
-benefit to adding a call to detect when a tdb_context is shared,
-to allow others to create such an API.
-
-2.6.2 Status
-
-Incomplete.
-
-2.7 TDB API Is Not POSIX Thread-safe
-
-The TDB API uses an error code which can be queried after an
-operation to determine what went wrong. This programming model
-does not work with threads, unless specific additional guarantees
-are given by the implementation. In addition, even
-otherwise-independent threads cannot open the same TDB (as in [TDB-Files-Cannot]
-).
-
-2.7.1 Proposed Solution
-
-Rearchitecting the API to include a tdb_errcode pointer would be a
-great deal of churn; we are better off guaranteeing that the
-tdb_errcode is per-thread so the current programming model can be
-maintained.
-
-This requires dynamic per-thread allocations, which is awkward
-with POSIX threads (pthread_key_create space is limited and we
-cannot simply allocate a key for every TDB).
-
-Internal locking is required to make sure that fcntl locks do not
-overlap between threads, and also that the global list of tdbs is
-maintained.
-
-The aim is that building tdb with -DTDB_PTHREAD will result in a
-pthread-safe version of the library, and otherwise no overhead
-will exist. Alternatively, a hooking mechanism similar to that
-proposed for [Proposed-Solution-locking-hook] could be used to
-enable pthread locking at runtime.
-
-2.7.2 Status
-
-Incomplete.
-
-2.8 *_nonblock Functions And *_mark Functions Expose
-  Implementation
-
-CTDB[footnote:
-Clustered TDB, see http://ctdb.samba.org
-] wishes to operate on TDB in a non-blocking manner. This is
-currently done as follows:
-
-1. Call the _nonblock variant of an API function (eg.
-  tdb_lockall_nonblock). If this fails:
-
-2. Fork a child process, and wait for it to call the normal
-  variant (eg. tdb_lockall).
-
-3. If the child succeeds, call the _mark variant to indicate we
-  already have the locks (eg. tdb_lockall_mark).
-
-4. Upon completion, tell the child to release the locks (eg.
-  tdb_unlockall).
-
-5. Indicate to tdb that it should consider the locks removed (eg.
-  tdb_unlockall_mark).
-
-There are several issues with this approach. Firstly, adding two
-new variants of each function clutters the API for an obscure
-use, and so not all functions have three variants. Secondly, it
-assumes that all paths of the functions ask for the same locks,
-otherwise the parent process will have to get a lock which the
-child doesn't have under some circumstances. I don't believe this
-is currently the case, but it constrains the implementation.
-
-2.8.1 <Proposed-Solution-locking-hook>Proposed Solution
-
-Implement a hook for locking methods, so that the caller can
-control the calls to create and remove fcntl locks. In this
-scenario, ctdbd would operate as follows:
-
-1. Call the normal API function, eg tdb_lockall().
-
-2. When the lock callback comes in, check if the child has the
-  lock. Initially, this is always false. If so, return 0.
-  Otherwise, try to obtain it in non-blocking mode. If that
-  fails, return EWOULDBLOCK.
-
-3. Release locks in the unlock callback as normal.
-
-4. If tdb_lockall() fails, see if we recorded a lock failure; if
-  so, call the child to repeat the operation.
-
-5. The child records what locks it obtains, and returns that
-  information to the parent.
-
-6. When the child has succeeded, goto 1.
-
-This is flexible enough to handle any potential locking scenario,
-even when lock requirements change. It can be optimized so that
-the parent does not release locks, just tells the child which
-locks it doesn't need to obtain.
-
-It also keeps the complexity out of the API, and in ctdbd where
-it is needed.
-
-2.8.2 Status
-
-Incomplete.
-
-2.9 tdb_chainlock Functions Expose Implementation
-
-tdb_chainlock locks some number of records, including the record
-indicated by the given key. This gave atomicity guarantees;
-no-one can start a transaction, alter, read or delete that key
-while the lock is held.
-
-It also makes the same guarantee for any other key in the chain,
-which is an internal implementation detail and potentially a
-cause for deadlock.
-
-2.9.1 Proposed Solution
-
-None. It would be nice to have an explicit single entry lock
-which affected no other keys. Unfortunately, this won't work for
-an entry which doesn't exist. Thus while chainlock may be
-implemented more efficiently for the existing case, it will still
-have overlap issues with the non-existing case. So it is best to
-keep the current (lack of) guarantee about which records will be
-affected to avoid constraining our implementation.
-
-2.10 Signal Handling is Not Race-Free
-
-The tdb_setalarm_sigptr() call allows the caller's signal handler
-to indicate that the tdb locking code should return with a
-failure, rather than trying again when a signal is received (and
-errno == EAGAIN). This is usually used to implement timeouts.
-
-Unfortunately, this does not work in the case where the signal is
-received before the tdb code enters the fcntl() call to place the
-lock: the code will sleep within the fcntl() code, unaware that
-the signal wants it to exit. In the case of long timeouts, this
-does not happen in practice.
-
-2.10.1 Proposed Solution
-
-The locking hooks proposed in [Proposed-Solution-locking-hook]
-would allow the user to decide on whether to fail the lock
-acquisition on a signal. This allows the caller to choose their
-own compromise: they could narrow the race by checking
-immediately before the fcntl call.[footnote:
-It may be possible to make this race-free in some implementations
-by having the signal handler alter the struct flock to make it
-invalid. This will cause the fcntl() lock call to fail with
-EINVAL if the signal occurs before the kernel is entered,
-otherwise EAGAIN.
-]
-
-2.10.2 Status
-
-Incomplete.
-
-2.11 The API Uses Gratuitous Typedefs, Capitals
-
-typedefs are useful for providing source compatibility when types
-can differ across implementations, or arguably in the case of
-function pointer definitions which are hard for humans to parse.
-Otherwise it is simply obfuscation and pollutes the namespace.
-
-Capitalization is usually reserved for compile-time constants and
-macros.
-
-  TDB_CONTEXT There is no reason to use this over 'struct
-  tdb_context'; the definition isn't visible to the API user
-  anyway.
-
-  TDB_DATA There is no reason to use this over struct TDB_DATA;
-  the struct needs to be understood by the API user.
-
-  struct TDB_DATA This would normally be called 'struct
-  tdb_data'.
-
-  enum TDB_ERROR Similarly, this would normally be enum
-  tdb_error.
-
-2.11.1 Proposed Solution
-
-None. Introducing lower case variants would please pedants like
-myself, but if it were done the existing ones should be kept.
-There is little point forcing a purely cosmetic change upon tdb
-users.
-
-2.12 <tdb_log_func-Doesnt-Take>tdb_log_func Doesn't Take The
-  Private Pointer
-
-For API compatibility reasons, the logging function needs to call
-tdb_get_logging_private() to retrieve the pointer registered by
-the tdb_open_ex for logging.
-
-2.12.1 Proposed Solution
-
-It should simply take an extra argument, since we are prepared to
-break the API/ABI.
-
-2.12.2 Status
-
-Complete.
-
-2.13 Various Callback Functions Are Not Typesafe
-
-The callback functions in tdb_set_logging_function (after
-[tdb_log_func-Doesnt-Take] is resolved), tdb_parse_record,
-tdb_traverse, tdb_traverse_read and tdb_check all take void * and
-must internally convert it to the argument type they were expecting.
-
-If this type changes, the compiler will not produce warnings on
-the callers, since it only sees void *.
-
-2.13.1 Proposed Solution
-
-With careful use of macros, we can create callback functions
-which give a warning when used on gcc and the types of the
-callback and its private argument differ. Unsupported compilers
-will not give a warning, which is no worse than now. In addition,
-the callbacks become clearer, as they need not use void * for
-their parameter.
-
-See CCAN's typesafe_cb module at
-http://ccan.ozlabs.org/info/typesafe_cb.html
-
-2.13.2 Status
-
-Incomplete.
-
-2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens,
-  tdb_reopen_all Problematic
-
-The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB
-file should be cleared if the caller discovers it is the only
-process with the TDB open. However, if any caller does not
-specify TDB_CLEAR_IF_FIRST it will not be detected, so will have
-the TDB erased underneath them (usually resulting in a crash).
-
-There is a similar issue on fork(); if the parent exits (or
-otherwise closes the tdb) before the child calls tdb_reopen_all()
-to establish the lock used to indicate the TDB is opened by
-someone, a TDB_CLEAR_IF_FIRST opener at that moment will believe
-it alone has opened the TDB and will erase it.
-
-2.14.1 Proposed Solution
-
-Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but
-see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
-
-2.14.2 Status
-
-Incomplete, TDB_CLEAR_IF_FIRST still defined, but not
-implemented.
-
-2.15 Extending The Header Is Difficult
-
-We have reserved (zeroed) words in the TDB header, which can be
-used for future features. If the future features are compulsory,
-the version number must be updated to prevent old code from
-accessing the database. But if the future feature is optional, we
-have no way of telling if older code is accessing the database or
-not.
-
-2.15.1 Proposed Solution
-
-The header should contain a “format variant” value (64-bit). This
-is divided into two 32-bit parts:
-
-1. The lower part reflects the format variant understood by code
-  accessing the database.
-
-2. The upper part reflects the format variant you must understand
-  to write to the database (otherwise you can only open for
-  reading).
-
-The latter field can only be written at creation time, the former
-should be written under the OPEN_LOCK when opening the database
-for writing, if the variant of the code is lower than the current
-lowest variant.
-
-This should allow backwards-compatible features to be added, and
-detection if older code (which doesn't understand the feature)
-writes to the database.
-
-2.15.2 Status
-
-Incomplete.
-
-2.16 Record Headers Are Not Expandable
-
-If we later want to add (say) checksums on keys and data, it
-would require another format change, which we'd like to avoid.
-
-2.16.1 Proposed Solution
-
-We often have extra padding at the tail of a record. If we ensure
-that the first byte (if any) of this padding is zero, we will
-have a way for future changes to detect code which doesn't
-understand a new format: the new code would write (say) a 1 at
-the tail, and thus if there is no tail or the first byte is 0, we
-would know the extension is not present on that record.
-
-2.16.2 Status
-
-Incomplete.
-
-2.17 TDB Does Not Use Talloc
-
-Many users of TDB (particularly Samba) use the talloc allocator,
-and thus have to wrap TDB in a talloc context to use it
-conveniently.
-
-2.17.1 Proposed Solution
-
-The allocation within TDB is not complicated enough to justify
-the use of talloc, and I am reluctant to force another
-(excellent) library on TDB users. Nonetheless a compromise is
-possible. An attribute (see [attributes]) can be added later to
-tdb_open() to provide an alternate allocation mechanism,
-specifically for talloc but usable by any other allocator (which
-would ignore the “context” argument).
-
-This would form a talloc hierarchy as expected, but the caller
-would still have to attach a destructor to the tdb context
-returned from tdb_open to close it. All TDB_DATA fields would be
-children of the tdb_context, and the caller would still have to
-manage them (using talloc_free() or talloc_steal()).
-
-2.17.2 Status
-
-Deferred.
-
-3 Performance And Scalability Issues
-
-3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST
-  Imposes Performance Penalty
-
-When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is
-placed at offset 4 (aka. the ACTIVE_LOCK). While these locks
-never conflict in normal tdb usage, they do add substantial
-overhead for most fcntl lock implementations when the kernel
-scans to detect if a lock conflict exists. This is often a single
-linked list, making the time to acquire and release a fcntl lock
-O(N) where N is the number of processes with the TDB open, not
-the number actually doing work.
-
-In a Samba server it is common to have huge numbers of clients
-sitting idle, and thus they have weaned themselves off the
-TDB_CLEAR_IF_FIRST flag.[footnote:
-There is a flag to tdb_reopen_all() which is used for this
-optimization: if the parent process will outlive the child, the
-child does not need the ACTIVE_LOCK. This is a workaround for
-this very performance issue.
-]
-
-3.1.1 Proposed Solution
-
-Remove the flag. It was a neat idea, but even trivial servers
-tend to know when they are initializing for the first time and
-can simply unlink the old tdb at that point.
-
-3.1.2 Status
-
-Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing.
-
-3.2 TDB Files Have a 4G Limit
-
-This seems to be becoming an issue (so much for “trivial”!),
-particularly for ldb.
-
-3.2.1 Proposed Solution
-
-A new, incompatible TDB format which uses 64 bit offsets
-internally rather than 32 bit as now. For simplicity of endian
-conversion (which TDB does on the fly if required), all values
-will be 64 bit on disk. In practice, some upper bits may be used
-for other purposes, but at least 56 bits will be available for
-file offsets.
-
-tdb_open() will automatically detect the old version, and even
-create old-format files if TDB_VERSION6 is specified to tdb_open.
-
-32 bit processes will still be able to access TDBs larger than 4G
-(assuming that their off_t allows them to seek to 64 bits), they
-will gracefully fall back as they fail to mmap. This can happen
-already with large TDBs.
-
-Old versions of tdb will fail to open the new TDB files (since 28
-August 2009, commit 398d0c29290: prior to that any unrecognized
-file format would be erased and initialized as a fresh tdb!)
-
-3.2.2 Status
-
-Complete.
-
-3.3 TDB Records Have a 4G Limit
-
-This has not been a reported problem, and the API uses size_t
-which can be 64 bit on 64 bit platforms. However, other limits
-may have made such an issue moot.
-
-3.3.1 Proposed Solution
-
-Record sizes will be 64 bit, with an error returned on 32 bit
-platforms which try to access such records (the current
-implementation would return TDB_ERR_OOM in a similar case). It
-seems unlikely that 32 bit keys will be a limitation, so the
-implementation may not support this (see [sub:Records-Incur-A]).
-
-3.3.2 Status
-
-Complete.
-
-3.4 Hash Size Is Determined At TDB Creation Time
-
-TDB contains a number of hash chains in the header; the number is
-specified at creation time, and defaults to 131. This is such a
-bottleneck on large databases (as each hash chain gets quite
-long), that LDB uses 10,000 for this hash. In general it is
-impossible to know what the 'right' answer is at database
-creation time.
-
-3.4.1 <sub:Hash-Size-Solution>Proposed Solution
-
-After comprehensive performance testing on various scalable hash
-variants[footnote:
-http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94
-This was annoying because I was previously convinced that an
-expanding tree of hashes would be very close to optimal.
-], it became clear that it is hard to beat a straight linear hash
-table which doubles in size when it reaches saturation.
-Unfortunately, altering the hash table introduces serious locking
-complications: the entire hash table needs to be locked to
-enlarge the hash table, and others might be holding locks.
-Particularly insidious are insertions done under tdb_chainlock.
-
-Thus an expanding layered hash will be used: an array of hash
-groups, with each hash group exploding into pointers to lower
-hash groups once it fills, turning into a hash tree. This has
-implications for locking: we must lock the entire group in case
-we need to expand it, yet we don't know how deep the tree is at
-that point.
-
-Note that bits from the hash table entries should be stolen to
-hold more hash bits to reduce the penalty of collisions. We can
-use the otherwise-unused lower 3 bits. If we limit the size of
-the database to 64 exabytes, we can use the top 8 bits of the
-hash entry as well. These 11 bits would reduce false positives
-down to 1 in 2000 which is more than we need: we can use one of
-the bits to indicate that the extra hash bits are valid. This
-means we can choose not to re-hash all entries when we expand a
-hash group; simply use the next bits we need and mark them
-invalid.
-
-3.4.2 Status
-
-Complete.
-
-3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
-
-TDB uses a single linked list for the free list. Allocation
-occurs as follows, using heuristics which have evolved over time:
-
-1. Get the free list lock for this whole operation.
-
-2. Multiply length by 1.25, so we always over-allocate by 25%.
-
-3. Set the slack multiplier to 1.
-
-4. Examine the current freelist entry: if it is > length but <
-  the current best case, remember it as the best case.
-
-5. Multiply the slack multiplier by 1.05.
-
-6. If our best fit so far is less than length * slack multiplier,
-  return it. The slack will be turned into a new free record if
-  it's large enough.
-
-7. Otherwise, go onto the next freelist entry.
-
-Deleting a record occurs as follows:
-
-1. Lock the hash chain for this whole operation.
-
-2. Walk the chain to find the record, keeping the prev pointer
-  offset.
-
-3. If max_dead is non-zero:
-
-  (a) Walk the hash chain again and count the dead records.
-
-  (b) If it's more than max_dead, bulk free all the dead ones
-    (similar to steps 4 and below, but the lock is only obtained
-    once).
-
-  (c) Simply mark this record as dead and return.
-
-4. Get the free list lock for the remainder of this operation.
-
-5. <right-merging>Examine the following block to see if it is
-  free; if so, enlarge the current block and remove that block
-  from the free list. This was disabled, as removal from the free
-  list was O(entries-in-free-list).
-
-6. Examine the preceding block to see if it is free: for this
-  reason, each block has a 32-bit tailer which indicates its
-  length. If it is free, expand it to cover our new block and
-  return.
-
-7. Otherwise, prepend ourselves to the free list.
-
-Disabling right-merging (step [right-merging]) causes
-fragmentation; the other heuristics proved insufficient to
-address this, so the final answer to this was that when we expand
-the TDB file inside a transaction commit, we repack the entire
-tdb.
-
-The single list lock limits our allocation rate; due to the other
-issues this is not currently seen as a bottleneck.
-
-3.5.1 Proposed Solution
-
-The first step is to remove all the current heuristics, as they
-obviously interact, then examine them once the lock contention is
-addressed.
-
-The free list must be split to reduce contention. Assuming
-perfect free merging, we can at most have 1 free list entry for
-each entry. This implies that the number of free lists is related
-to the size of the hash table, but as it is rare to walk a large
-number of free list entries we can use far fewer, say 1/32 of the
-number of hash buckets.
-
-It seems tempting to try to reuse the hash implementation which
-we use for records here, but we have two ways of searching for
-free entries: for allocation we search by size (and possibly
-zone) which produces too many clashes for our hash table to
-handle well, and for coalescing we search by address. Thus an
-array of doubly-linked free lists seems preferable.
-
-There are various benefits in using per-size free lists (see
-[sub:TDB-Becomes-Fragmented]) but it's not clear this would reduce
-contention in the common case where all processes are
-allocating/freeing the same size. Thus we almost certainly need to
-divide in other ways: the most obvious is to divide the file into
-zones, and use a free list (or table of free lists) for each.
-This approximates address ordering.
-
-Unfortunately it is difficult to know what heuristics should be
-used to determine zone sizes, and our transaction code relies on
-being able to create a “recovery area” by simply appending to the
-file (difficult if it would need to create a new zone header).
-Thus we use a linked-list of free tables; currently we only ever
-create one, but if there is more than one we choose one at random
-to use. In future we may use heuristics to add new free tables on
-contention. We only expand the file when all free tables are
-exhausted.
-
-The basic algorithm is as follows. Freeing is simple:
-
-1. Identify the correct free list.
-
-2. Lock the corresponding list.
-
-3. Re-check the list (we didn't have a lock, sizes could have
-  changed): relock if necessary.
-
-4. Place the freed entry in the list.
-
-Allocation is a little more complicated, as we perform delayed
-coalescing at this point:
-
-1. Pick a free table; usually the previous one.
-
-2. Lock the corresponding list.
-
-3. If the top entry is large enough, remove it from the list and
-  return it.
-
-4. Otherwise, coalesce entries in the list. If there was no entry
-  large enough, unlock the list and try the next largest list.
-
-5. If no list has an entry which meets our needs, try the next
-  free table.
-
-6. If no free table satisfies, expand the file.
-
-This optimizes rapid insert/delete of free list entries by not
-coalescing them all the time. First-fit address ordering
-seems to be fairly good for keeping fragmentation low
-(see [sub:TDB-Becomes-Fragmented]). Note that address ordering
-does not need a tailer to coalesce, though if we needed one we
-could have one cheaply: see [sub:Records-Incur-A].
-
-Each free entry has the free table number in the header: less
-than 255. It also contains a doubly-linked list for easy
-deletion.
-
-3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented
-
-Much of this is a result of allocation strategy[footnote:
-The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995
-ftp://ftp.cs.utexas.edu/pub/garbage/malloc/ismm98.ps
-] and deliberate hobbling of coalescing; internal fragmentation
-(aka overallocation) is deliberately set at 25%, and external
-fragmentation is only cured by the decision to repack the entire
-db when a transaction commit needs to enlarge the file.
-
-3.6.1 Proposed Solution
-
-The 25% overhead on allocation works in practice for ldb because
-indexes tend to expand by one record at a time. This internal
-fragmentation can be resolved by having an “expanded” bit in the
-header to note entries that have previously expanded, and
-allocating more space for them.
-
-There is a spectrum of possible solutions for external
-fragmentation: one is to use a fragmentation-avoiding allocation
-strategy such as a best-fit address-order allocator. The other end
-of the spectrum would be to use a bump allocator (very fast and
-simple) and simply repack the file when we reach the end.
-
-There are three problems with efficient fragmentation-avoiding
-allocators: they are non-trivial, they tend to use a single free
-list for each size, and there's no evidence that tdb allocation
-patterns will match those recorded for general allocators (though
-it seems likely).
-
-Thus we don't spend too much effort on external fragmentation; we
-will be no worse than the current code if we need to repack on
-occasion. More effort is spent on reducing freelist contention,
-and reducing overhead.
-
-3.7 <sub:Records-Incur-A>Records Incur A 28-Byte Overhead
-
-Each TDB record has a header as follows:
-
-struct tdb_record {
-
-        tdb_off_t next; /* offset of the next record in the list
-*/
-
-        tdb_len_t rec_len; /* total byte length of record */
-
-        tdb_len_t key_len; /* byte length of key */
-
-        tdb_len_t data_len; /* byte length of data */
-
-        uint32_t full_hash; /* the full 32 bit hash of the key */
-
-        uint32_t magic;   /* try to catch errors */
-
-        /* the following union is implied:
-
-                union {
-
-                        char record[rec_len];
-
-                        struct {
-
-                                char key[key_len];
-
-                                char data[data_len];
-
-                        }
-
-                        uint32_t totalsize; (tailer)
-
-                }
-
-        */
-
-};
-
-Naively, this would double to a 56-byte overhead on a 64 bit
-implementation.
-
-3.7.1 Proposed Solution
-
-We can use various techniques to reduce this for an allocated
-block:
-
-1. The 'next' pointer is not required, as we are using a flat
-  hash table.
-
-2. 'rec_len' can instead be expressed as an addition to key_len
-  and data_len (it accounts for wasted or overallocated length in
-  the record). Since the record length is always a multiple of 8,
-  we can conveniently fit it in 32 bits (representing up to 35
-  bits).
-
-3. 'key_len' and 'data_len' can be reduced. I'm unwilling to
-  restrict 'data_len' to 32 bits, but instead we can combine the
-  two into one 64-bit field and use a 5 bit value which
-  indicates at what bit to divide the two. Keys are unlikely to
-  scale as fast as data, so I'm assuming a maximum key size of 32
-  bits.
-
-4. 'full_hash' is used to avoid a memcmp on the “miss” case, but
-  this is diminishing returns after a handful of bits (at 10
-  bits, it reduces 99.9% of false memcmp). As an aside, as the
-  lower bits are already incorporated in the hash table
-  resolution, the upper bits should be used here. Note that it's
-  not clear that these bits will be a win, given the extra bits
-  in the hash table itself (see [sub:Hash-Size-Solution]).
-
-5. 'magic' does not need to be enlarged: it currently reflects
-  one of 5 values (used, free, dead, recovery, and
-  unused_recovery). It is useful for quick sanity checking
-  however, and should not be eliminated.
-
-6. 'tailer' is only used to coalesce free blocks (so a block to
-  the right can find the header to check if this block is free).
-  This can be replaced by a single 'free' bit in the header of
-  the following block (and the tailer only exists in free
-  blocks).[footnote:
-This technique from Thomas Standish. Data Structure Techniques.
-Addison-Wesley, Reading, Massachusetts, 1980.
-] The current proposed coalescing algorithm doesn't need this,
-  however.
-
-This produces a 16 byte used header like this:
-
-struct tdb_used_record {
-
-        uint32_t used_magic : 16,
-
-
-
-                 key_data_divide: 5,
-
-                 top_hash: 11;
-
-        uint32_t extra_octets;
-
-        uint64_t key_and_data_len;
-
-};
-
-And a free record like this:
-
-struct tdb_free_record {
-
-        uint64_t free_magic: 8,
-
-                   prev : 56;
-
-
-
-        uint64_t free_table: 8,
-
-                 total_length : 56;
-
-        uint64_t next;
-
-};
-
-Note that by limiting valid offsets to 56 bits, we can pack
-everything we need into 3 64-bit words, meaning our minimum
-record size is 8 bytes on top of the 16 byte used header.
-
-3.7.2 Status
-
-Complete.
-
-3.8 Transaction Commit Requires 4 fdatasync
-
-The current transaction algorithm is:
-
-1. write_recovery_data();
-
-2. sync();
-
-3. write_recovery_header();
-
-4. sync();
-
-5. overwrite_with_new_data();
-
-6. sync();
-
-7. remove_recovery_header();
-
-8. sync();
-
-On current ext3, each sync flushes all data to disk, so the next
-3 syncs are relatively expensive. But this could become a
-performance bottleneck on other filesystems such as ext4.
-
-3.8.1 Proposed Solution
-
-Neil Brown points out that this is overzealous, and only one sync
-is needed:
-
-1. Bundle the recovery data, a transaction counter and a strong
-  checksum of the new data.
-
-2. Strong checksum that whole bundle.
-
-3. Store the bundle in the database.
-
-4. Overwrite the oldest of the two recovery pointers in the
-  header (identified using the transaction counter) with the
-  offset of this bundle.
-
-5. sync.
-
-6. Write the new data to the file.
-
-Checking for recovery means identifying the latest bundle with a
-valid checksum and using the new data checksum to ensure that it
-has been applied. This is more expensive than the current check,
-but need only be done at open. For running databases, a separate
-header field can be used to indicate a transaction in progress;
-we need only check for recovery if this is set.
-
-3.8.2 Status
-
-Deferred.
-
-3.9 <sub:TDB-Does-Not>TDB Does Not Have Snapshot Support
-
-3.9.1 Proposed Solution
-
-None. At some point you say “use a real database” (but see
-[replay-attribute]).
-
-But as a thought experiment, if we implemented transactions to
-only overwrite free entries (this is tricky: there must not be a
-header in each entry which indicates whether it is free, but use
-of presence in metadata elsewhere), and a pointer to the hash
-table, we could create an entirely new commit without destroying
-existing data. Then it would be easy to implement snapshots in a
-similar way.
-
-This would not allow arbitrary changes to the database, such as
-tdb_repack does, and would require more space (since we have to
-preserve the current and future entries at once). If we used hash
-trees rather than one big hash table, we might only have to
-rewrite some sections of the hash, too.
-
-We could then implement snapshots using a similar method, using
-multiple different hash tables/free tables.
-
-3.9.2 Status
-
-Deferred.
-
-3.10 Transactions Cannot Operate in Parallel
-
-This would be useless for ldb, as it hits the index records with
-just about every update. It would add significant complexity in
-resolving clashes, and cause all the transaction callers to write
-their code to loop in the case where the transactions spuriously
-failed.
-
-3.10.1 Proposed Solution
-
-None (but see [replay-attribute]). We could solve a small part of
-the problem by providing read-only transactions. These would
-allow one write transaction to begin, but it could not commit
-until all r/o transactions are done. This would require a new
-RO_TRANSACTION_LOCK, which would be upgraded on commit.
-
-3.10.2 Status
-
-Deferred.
-
-3.11 Default Hash Function Is Suboptimal
-
-The Knuth-inspired multiplicative hash used by tdb is fairly slow
-(especially if we expand it to 64 bits), and works best when the
-hash bucket size is a prime number (which also means a slow
-modulus). In addition, it is highly predictable which could
-potentially lead to a Denial of Service attack in some TDB uses.
-
-3.11.1 Proposed Solution
-
-The Jenkins lookup3 hash[footnote:
-http://burtleburtle.net/bob/c/lookup3.c
-] is a fast and superbly-mixing hash. It's used by the Linux
-kernel and almost everything else. This has the particular
-properties that it takes an initial seed, and produces two 32 bit
-hash numbers, which we can combine into a 64-bit hash.
-
-The seed should be created at tdb-creation time from some random
-source, and placed in the header. This is far from foolproof, but
-adds a little bit of protection against hash bombing.
-
-3.11.2 Status
-
-Complete.
-
-3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity
-
-We lock a record during traversal iteration, and try to grab that
-lock in the delete code. If that grab on delete fails, we simply
-mark it deleted and continue onwards; traversal checks for this
-condition and does the delete when it moves off the record.
-
-If traversal terminates, the dead record may be left
-indefinitely.
-
-3.12.1 Proposed Solution
-
-Remove reliability guarantees; see [traverse-Proposed-Solution].
-
-3.12.2 Status
-
-Complete.
-
-3.13 Fcntl Locking Adds Overhead
-
-Placing a fcntl lock means a system call, as does removing one.
-This is actually one reason why transactions can be faster
-(everything is locked once at transaction start). In the
-uncontended case, this overhead can theoretically be eliminated.
-
-3.13.1 Proposed Solution
-
-None.
-
-We tried this before with spinlock support, in the early days of
-TDB, and it didn't make much difference except in manufactured
-benchmarks.
-
-We could use spinlocks (with futex kernel support under Linux),
-but it means that we lose automatic cleanup when a process dies
-with a lock. There is a method of auto-cleanup under Linux, but
-it's not supported by other operating systems. We could
-reintroduce a clear-if-first-style lock and sweep for dead
-futexes on open, but that wouldn't help the normal case of one
-concurrent opener dying. Increasingly elaborate repair schemes
-could be considered, but they require an ABI change (everyone
-must use them) anyway, so there's no need to do this at the same
-time as everything else.
-
-3.14 Some Transactions Don't Require Durability
-
-Volker points out that gencache uses a CLEAR_IF_FIRST tdb for
-normal (fast) usage, and occasionally empties the results into a
-transactional TDB. This kind of usage prioritizes performance
-over durability: as long as we are consistent, data can be lost.
-
-This would be more neatly implemented inside tdb: a “soft”
-transaction commit (ie. syncless) which meant that data may be
-reverted on a crash.
-
-3.14.1 Proposed Solution
-
-None.
-
-Unfortunately any transaction scheme which overwrites old data
-requires a sync before that overwrite to avoid the possibility of
-corruption.
-
-It seems possible to use a scheme similar to that described in
-[sub:TDB-Does-Not], where transactions are committed without
-overwriting existing data, and an array of top-level pointers is
-available in the header. If the transaction is “soft” then we
-would not need a sync at all: existing processes would pick up the
-new hash table and free list and work with that.
-
-At some later point, a sync would allow recovery of the old data
-into the free lists (perhaps when the array of top-level pointers
-filled). On crash, tdb_open() would examine the array of top
-levels, and apply the transactions until it encountered an
-invalid checksum.
-
-3.15 Tracing Is Fragile, Replay Is External
-
-The current TDB has compile-time-enabled tracing code, but it
-often breaks as it is not enabled by default. In a similar way,
-the ctdb code has an external wrapper which does replay tracing
-so it can coordinate cluster-wide transactions.
-
-3.15.1 <replay-attribute>Proposed Solution
-
-Tridge points out that an attribute can be later added to
-tdb_open (see [attributes]) to provide replay/trace hooks, which
-could become the basis for this and future parallel transactions
-and snapshot support.
-
-3.15.2 Status
-
-Deferred.
diff --git a/lib/tdb2/free.c b/lib/tdb2/free.c
deleted file mode 100644
index c4015a0f2a..0000000000
--- a/lib/tdb2/free.c
+++ /dev/null
@@ -1,976 +0,0 @@
- /*
-   Trivial Database 2: free list/block handling
-   Copyright (C) Rusty Russell 2010
-
-   This library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 3 of the License, or (at your option) any later version.
-
-   This library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
-#include "private.h"
-#include <ccan/likely/likely.h>
-#include <ccan/ilog/ilog.h>
-#include <time.h>
-#include <assert.h>
-#include <limits.h>
-
-static unsigned fls64(uint64_t val)
-{
-	return ilog64(val);
-}
-
-/* In which bucket would we find a particular record size? (ignoring header) */
-unsigned int size_to_bucket(tdb_len_t data_len)
-{
-	unsigned int bucket;
-
-	/* We can't have records smaller than this. */
-	assert(data_len >= TDB_MIN_DATA_LEN);
-
-	/* Ignoring the header... */
-	if (data_len - TDB_MIN_DATA_LEN <= 64) {
-		/* 0 in bucket 0, 8 in bucket 1... 64 in bucket 8. */
-		bucket = (data_len - TDB_MIN_DATA_LEN) / 8;
-	} else {
-		/* After that we go power of 2. */
-		bucket = fls64(data_len - TDB_MIN_DATA_LEN) + 2;
-	}
-
-	if (unlikely(bucket >= TDB_FREE_BUCKETS))
-		bucket = TDB_FREE_BUCKETS - 1;
-	return bucket;
-}
-
-tdb_off_t first_ftable(struct tdb_context *tdb)
-{
-	return tdb_read_off(tdb, offsetof(struct tdb_header, free_table));
-}
-
-tdb_off_t next_ftable(struct tdb_context *tdb, tdb_off_t ftable)
-{
-	return tdb_read_off(tdb, ftable + offsetof(struct tdb_freetable,next));
-}
-
-enum TDB_ERROR tdb_ftable_init(struct tdb_context *tdb)
-{
-	/* Use reservoir sampling algorithm to select a free list at random. */
-	unsigned int rnd, max = 0, count = 0;
-	tdb_off_t off;
-
-	tdb->ftable_off = off = first_ftable(tdb);
-	tdb->ftable = 0;
-
-	while (off) {
-		if (TDB_OFF_IS_ERR(off)) {
-			return TDB_OFF_TO_ERR(off);
-		}
-
-		rnd = random();
-		if (rnd >= max) {
-			tdb->ftable_off = off;
-			tdb->ftable = count;
-			max = rnd;
-		}
-
-		off = next_ftable(tdb, off);
-		count++;
-	}
-	return TDB_SUCCESS;
-}
-
-/* Offset of a given bucket. */
-tdb_off_t bucket_off(tdb_off_t ftable_off, unsigned bucket)
-{
-	return ftable_off + offsetof(struct tdb_freetable, buckets)
-		+ bucket * sizeof(tdb_off_t);
-}
-
-/* Returns free_buckets + 1, or list number to search, or -ve error. */
-static tdb_off_t find_free_head(struct tdb_context *tdb,
-				tdb_off_t ftable_off,
-				tdb_off_t bucket)
-{
-	/* Speculatively search for a non-zero bucket. */
-	return tdb_find_nonzero_off(tdb, bucket_off(ftable_off, 0),
-				    bucket, TDB_FREE_BUCKETS);
-}
-
-static void check_list(struct tdb_context *tdb, tdb_off_t b_off)
-{
-#ifdef CCAN_TDB2_DEBUG
-	tdb_off_t off, prev = 0, first;
-	struct tdb_free_record r;
-
-	first = off = (tdb_read_off(tdb, b_off) & TDB_OFF_MASK);
-	while (off != 0) {
-		tdb_read_convert(tdb, off, &r, sizeof(r));
-		if (frec_magic(&r) != TDB_FREE_MAGIC)
-			abort();
-		if (prev && frec_prev(&r) != prev)
-			abort();
-		prev = off;
-		off = r.next;
-	}
-
-	if (first) {
-		tdb_read_convert(tdb, first, &r, sizeof(r));
-		if (frec_prev(&r) != prev)
-			abort();
-	}
-#endif
-}
-
-/* Remove from free bucket. */
-static enum TDB_ERROR remove_from_list(struct tdb_context *tdb,
-				       tdb_off_t b_off, tdb_off_t r_off,
-				       const struct tdb_free_record *r)
-{
-	tdb_off_t off, prev_next, head;
-	enum TDB_ERROR ecode;
-
-	/* Is this only element in list?  Zero out bucket, and we're done. */
-	if (frec_prev(r) == r_off)
-		return tdb_write_off(tdb, b_off, 0);
-
-	/* off = &r->prev->next */
-	off = frec_prev(r) + offsetof(struct tdb_free_record, next);
-
-	/* Get prev->next */
-	prev_next = tdb_read_off(tdb, off);
-	if (TDB_OFF_IS_ERR(prev_next))
-		return TDB_OFF_TO_ERR(prev_next);
-
-	/* If prev->next == 0, we were head: update bucket to point to next. */
-	if (prev_next == 0) {
-		/* We must preserve upper bits. */
-		head = tdb_read_off(tdb, b_off);
-		if (TDB_OFF_IS_ERR(head))
-			return TDB_OFF_TO_ERR(head);
-
-		if ((head & TDB_OFF_MASK) != r_off) {
-			return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-					  "remove_from_list:"
-					  " %llu head %llu on list %llu",
-					  (long long)r_off,
-					  (long long)head,
-					  (long long)b_off);
-		}
-		head = ((head & ~TDB_OFF_MASK) | r->next);
-		ecode = tdb_write_off(tdb, b_off, head);
-		if (ecode != TDB_SUCCESS)
-			return ecode;
-	} else {
-		/* r->prev->next = r->next */
-		ecode = tdb_write_off(tdb, off, r->next);
-		if (ecode != TDB_SUCCESS)
-			return ecode;
-	}
-
-	/* If we were the tail, off = &head->prev. */
-	if (r->next == 0) {
-		head = tdb_read_off(tdb, b_off);
-		if (TDB_OFF_IS_ERR(head))
-			return TDB_OFF_TO_ERR(head);
-		head &= TDB_OFF_MASK;
-		off = head + offsetof(struct tdb_free_record, magic_and_prev);
-	} else {
-		/* off = &r->next->prev */
-		off = r->next + offsetof(struct tdb_free_record,
-					 magic_and_prev);
-	}
-
-#ifdef CCAN_TDB2_DEBUG
-	/* *off == r */
-	if ((tdb_read_off(tdb, off) & TDB_OFF_MASK) != r_off) {
-		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-				  "remove_from_list:"
-				  " %llu bad prev in list %llu",
-				  (long long)r_off, (long long)b_off);
-	}
-#endif
-	/* r->next->prev = r->prev */
-	return tdb_write_off(tdb, off, r->magic_and_prev);
-}
-
-/* Enqueue in this free bucket: sets coalesce if we've added 128
- * entries to it. */
-static enum TDB_ERROR enqueue_in_free(struct tdb_context *tdb,
-				      tdb_off_t b_off,
-				      tdb_off_t off,
-				      tdb_len_t len,
-				      bool *coalesce)
-{
-	struct tdb_free_record new;
-	enum TDB_ERROR ecode;
-	tdb_off_t prev, head;
-	uint64_t magic = (TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL));
-
-	head = tdb_read_off(tdb, b_off);
-	if (TDB_OFF_IS_ERR(head))
-		return TDB_OFF_TO_ERR(head);
-
-	/* We only need to set ftable_and_len; rest is set in enqueue_in_free */
-	new.ftable_and_len = ((uint64_t)tdb->ftable
-			      << (64 - TDB_OFF_UPPER_STEAL))
-		| len;
-
-	/* new->next = head. */
-	new.next = (head & TDB_OFF_MASK);
-
-	/* First element?  Prev points to ourselves. */
-	if (!new.next) {
-		new.magic_and_prev = (magic | off);
-	} else {
-		/* new->prev = next->prev */
-		prev = tdb_read_off(tdb,
-				    new.next + offsetof(struct tdb_free_record,
-							magic_and_prev));
-		new.magic_and_prev = prev;
-		if (frec_magic(&new) != TDB_FREE_MAGIC) {
-			return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-					  "enqueue_in_free: %llu bad head"
-					  " prev %llu",
-					  (long long)new.next,
-					  (long long)prev);
-		}
-		/* next->prev = new. */
-		ecode = tdb_write_off(tdb, new.next
-				      + offsetof(struct tdb_free_record,
-						 magic_and_prev),
-				      off | magic);
-		if (ecode != TDB_SUCCESS) {
-			return ecode;
-		}
-
-#ifdef CCAN_TDB2_DEBUG
-		prev = tdb_read_off(tdb, frec_prev(&new)
-				    + offsetof(struct tdb_free_record, next));
-		if (prev != 0) {
-			return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-					  "enqueue_in_free:"
-					  " %llu bad tail next ptr %llu",
-					  (long long)frec_prev(&new)
-					  + offsetof(struct tdb_free_record,
-						     next),
-					  (long long)prev);
-		}
-#endif
-	}
-
-	/* Update enqueue count, but don't set high bit: see TDB_OFF_IS_ERR */
-	if (*coalesce)
-		head += (1ULL << (64 - TDB_OFF_UPPER_STEAL));
-	head &= ~(TDB_OFF_MASK | (1ULL << 63));
-	head |= off;
-
-	ecode = tdb_write_off(tdb, b_off, head);
-	if (ecode != TDB_SUCCESS) {
-		return ecode;
-	}
-
-	/* It's time to coalesce if counter wrapped. */
-	if (*coalesce)
-		*coalesce = ((head & ~TDB_OFF_MASK) == 0);
-
-	return tdb_write_convert(tdb, off, &new, sizeof(new));
-}
-
-static tdb_off_t ftable_offset(struct tdb_context *tdb, unsigned int ftable)
-{
-	tdb_off_t off;
-	unsigned int i;
-
-	if (likely(tdb->ftable == ftable))
-		return tdb->ftable_off;
-
-	off = first_ftable(tdb);
-	for (i = 0; i < ftable; i++) {
-		if (TDB_OFF_IS_ERR(off)) {
-			break;
-		}
-		off = next_ftable(tdb, off);
-	}
-	return off;
-}
-
-/* Note: we unlock the current bucket if fail (-ve), or coalesce (+ve) and
- * need to blatt the *protect record (which is set to an error). */
-static tdb_len_t coalesce(struct tdb_context *tdb,
-			  tdb_off_t off, tdb_off_t b_off,
-			  tdb_len_t data_len,
-			  tdb_off_t *protect)
-{
-	tdb_off_t end;
-	struct tdb_free_record rec;
-	enum TDB_ERROR ecode;
-
-	tdb->stats.alloc_coalesce_tried++;
-	end = off + sizeof(struct tdb_used_record) + data_len;
-
-	while (end < tdb->file->map_size) {
-		const struct tdb_free_record *r;
-		tdb_off_t nb_off;
-		unsigned ftable, bucket;
-
-		r = tdb_access_read(tdb, end, sizeof(*r), true);
-		if (TDB_PTR_IS_ERR(r)) {
-			ecode = TDB_PTR_ERR(r);
-			goto err;
-		}
-
-		if (frec_magic(r) != TDB_FREE_MAGIC
-		    || frec_ftable(r) == TDB_FTABLE_NONE) {
-			tdb_access_release(tdb, r);
-			break;
-		}
-
-		ftable = frec_ftable(r);
-		bucket = size_to_bucket(frec_len(r));
-		nb_off = ftable_offset(tdb, ftable);
-		if (TDB_OFF_IS_ERR(nb_off)) {
-			tdb_access_release(tdb, r);
-			ecode = TDB_OFF_TO_ERR(nb_off);
-			goto err;
-		}
-		nb_off = bucket_off(nb_off, bucket);
-		tdb_access_release(tdb, r);
-
-		/* We may be violating lock order here, so best effort. */
-		if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT)
-		    != TDB_SUCCESS) {
-			tdb->stats.alloc_coalesce_lockfail++;
-			break;
-		}
-
-		/* Now we have lock, re-check. */
-		ecode = tdb_read_convert(tdb, end, &rec, sizeof(rec));
-		if (ecode != TDB_SUCCESS) {
-			tdb_unlock_free_bucket(tdb, nb_off);
-			goto err;
-		}
-
-		if (unlikely(frec_magic(&rec) != TDB_FREE_MAGIC)) {
-			tdb->stats.alloc_coalesce_race++;
-			tdb_unlock_free_bucket(tdb, nb_off);
-			break;
-		}
-
-		if (unlikely(frec_ftable(&rec) != ftable)
-		    || unlikely(size_to_bucket(frec_len(&rec)) != bucket)) {
-			tdb->stats.alloc_coalesce_race++;
-			tdb_unlock_free_bucket(tdb, nb_off);
-			break;
-		}
-
-		/* Did we just mess up a record you were hoping to use? */
-		if (end == *protect) {
-			tdb->stats.alloc_coalesce_iterate_clash++;
-			*protect = TDB_ERR_TO_OFF(TDB_ERR_NOEXIST);
-		}
-
-		ecode = remove_from_list(tdb, nb_off, end, &rec);
-		check_list(tdb, nb_off);
-		if (ecode != TDB_SUCCESS) {
-			tdb_unlock_free_bucket(tdb, nb_off);
-			goto err;
-		}
-
-		end += sizeof(struct tdb_used_record) + frec_len(&rec);
-		tdb_unlock_free_bucket(tdb, nb_off);
-		tdb->stats.alloc_coalesce_num_merged++;
-	}
-
-	/* Didn't find any adjacent free? */
-	if (end == off + sizeof(struct tdb_used_record) + data_len)
-		return 0;
-
-	/* Before we expand, check this isn't one you wanted protected? */
-	if (off == *protect) {
-		*protect = TDB_ERR_TO_OFF(TDB_ERR_EXISTS);
-		tdb->stats.alloc_coalesce_iterate_clash++;
-	}
-
-	/* OK, expand initial record */
-	ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
-	if (ecode != TDB_SUCCESS) {
-		goto err;
-	}
-
-	if (frec_len(&rec) != data_len) {
-		ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-				   "coalesce: expected data len %zu not %zu",
-				   (size_t)data_len, (size_t)frec_len(&rec));
-		goto err;
-	}
-
-	ecode = remove_from_list(tdb, b_off, off, &rec);
-	check_list(tdb, b_off);
-	if (ecode != TDB_SUCCESS) {
-		goto err;
-	}
-
-	/* Try locking violation first.  We don't allow coalesce recursion! */
-	ecode = add_free_record(tdb, off, end - off, TDB_LOCK_NOWAIT, false);
-	if (ecode != TDB_SUCCESS) {
-		/* Need to drop lock.  Can't rely on anything stable. */
-		tdb->stats.alloc_coalesce_lockfail++;
-		*protect = TDB_ERR_TO_OFF(TDB_ERR_CORRUPT);
-
-		/* We have to drop this to avoid deadlocks, so make sure record
-		 * doesn't get coalesced by someone else! */
-		rec.ftable_and_len = (TDB_FTABLE_NONE
-				      << (64 - TDB_OFF_UPPER_STEAL))
-			| (end - off - sizeof(struct tdb_used_record));
-		ecode = tdb_write_off(tdb,
-				      off + offsetof(struct tdb_free_record,
-						     ftable_and_len),
-				      rec.ftable_and_len);
-		if (ecode != TDB_SUCCESS) {
-			goto err;
-		}
-
-		tdb_unlock_free_bucket(tdb, b_off);
-
-		ecode = add_free_record(tdb, off, end - off, TDB_LOCK_WAIT,
-					false);
-		if (ecode != TDB_SUCCESS) {
-			return TDB_ERR_TO_OFF(ecode);
-		}
-	} else if (TDB_OFF_IS_ERR(*protect)) {
-		/* For simplicity, we always drop lock if they can't continue */
-		tdb_unlock_free_bucket(tdb, b_off);
-	}
-	tdb->stats.alloc_coalesce_succeeded++;
-
-	/* Return usable length. */
-	return end - off - sizeof(struct tdb_used_record);
-
-err:
-	/* To unify error paths, we *always* unlock bucket on error. */
-	tdb_unlock_free_bucket(tdb, b_off);
-	return TDB_ERR_TO_OFF(ecode);
-}
-
-/* List is locked: we unlock it. */
-static enum TDB_ERROR coalesce_list(struct tdb_context *tdb,
-				    tdb_off_t ftable_off,
-				    tdb_off_t b_off,
-				    unsigned int limit)
-{
-	enum TDB_ERROR ecode;
-	tdb_off_t off;
-
-	off = tdb_read_off(tdb, b_off);
-	if (TDB_OFF_IS_ERR(off)) {
-		ecode = TDB_OFF_TO_ERR(off);
-		goto unlock_err;
-	}
-	/* A little bit of paranoia: counter should be 0. */
-	off &= TDB_OFF_MASK;
-
-	while (off && limit--) {
-		struct tdb_free_record rec;
-		tdb_len_t coal;
-		tdb_off_t next;
-
-		ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
-		if (ecode != TDB_SUCCESS)
-			goto unlock_err;
-
-		next = rec.next;
-		coal = coalesce(tdb, off, b_off, frec_len(&rec), &next);
-		if (TDB_OFF_IS_ERR(coal)) {
-			/* This has already unlocked on error. */
-			return TDB_OFF_TO_ERR(coal);
-		}
-		if (TDB_OFF_IS_ERR(next)) {
-			/* Coalescing had to unlock, so stop. */
-			return TDB_SUCCESS;
-		}
-		/* Keep going if we're doing well... */
-		limit += size_to_bucket(coal / 16 + TDB_MIN_DATA_LEN);
-		off = next;
-	}
-
-	/* Now, move those elements to the tail of the list so we get something
-	 * else next time. */
-	if (off) {
-		struct tdb_free_record oldhrec, newhrec, oldtrec, newtrec;
-		tdb_off_t oldhoff, oldtoff, newtoff;
-
-		/* The record we were up to is the new head. */
-		ecode = tdb_read_convert(tdb, off, &newhrec, sizeof(newhrec));
-		if (ecode != TDB_SUCCESS)
-			goto unlock_err;
-
-		/* Get the new tail. */
-		newtoff = frec_prev(&newhrec);
-		ecode = tdb_read_convert(tdb, newtoff, &newtrec,
-					 sizeof(newtrec));
-		if (ecode != TDB_SUCCESS)
-			goto unlock_err;
-
-		/* Get the old head. */
-		oldhoff = tdb_read_off(tdb, b_off);
-		if (TDB_OFF_IS_ERR(oldhoff)) {
-			ecode = TDB_OFF_TO_ERR(oldhoff);
-			goto unlock_err;
-		}
-
-		/* This could happen if they all coalesced away. */
-		if (oldhoff == off)
-			goto out;
-
-		ecode = tdb_read_convert(tdb, oldhoff, &oldhrec,
-					 sizeof(oldhrec));
-		if (ecode != TDB_SUCCESS)
-			goto unlock_err;
-
-		/* Get the old tail. */
-		oldtoff = frec_prev(&oldhrec);
-		ecode = tdb_read_convert(tdb, oldtoff, &oldtrec,
-					 sizeof(oldtrec));
-		if (ecode != TDB_SUCCESS)
-			goto unlock_err;
-
-		/* Old tail's next points to old head. */
-		oldtrec.next = oldhoff;
-
-		/* Old head's prev points to old tail. */
-		oldhrec.magic_and_prev
-			= (TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL))
-			| oldtoff;
-
-		/* New tail's next is 0. */
-		newtrec.next = 0;
-
-		/* Write out the modified versions. */
-		ecode = tdb_write_convert(tdb, oldtoff, &oldtrec,
-					  sizeof(oldtrec));
-		if (ecode != TDB_SUCCESS)
-			goto unlock_err;
-
-		ecode = tdb_write_convert(tdb, oldhoff, &oldhrec,
-					  sizeof(oldhrec));
-		if (ecode != TDB_SUCCESS)
-			goto unlock_err;
-
-		ecode = tdb_write_convert(tdb, newtoff, &newtrec,
-					  sizeof(newtrec));
-		if (ecode != TDB_SUCCESS)
-			goto unlock_err;
-
-		/* And finally link in new head. */
-		ecode = tdb_write_off(tdb, b_off, off);
-		if (ecode != TDB_SUCCESS)
-			goto unlock_err;
-	}
-out:
-	tdb_unlock_free_bucket(tdb, b_off);
-	return TDB_SUCCESS;
-
-unlock_err:
-	tdb_unlock_free_bucket(tdb, b_off);
-	return ecode;
-}
-
-/* List must not be locked if coalesce_ok is set. */
-enum TDB_ERROR add_free_record(struct tdb_context *tdb,
-			       tdb_off_t off, tdb_len_t len_with_header,
-			       enum tdb_lock_flags waitflag,
-			       bool coalesce_ok)
-{
-	tdb_off_t b_off;
-	tdb_len_t len;
-	enum TDB_ERROR ecode;
-
-	assert(len_with_header >= sizeof(struct tdb_free_record));
-
-	len = len_with_header - sizeof(struct tdb_used_record);
-
-	b_off = bucket_off(tdb->ftable_off, size_to_bucket(len));
-	ecode = tdb_lock_free_bucket(tdb, b_off, waitflag);
-	if (ecode != TDB_SUCCESS) {
-		return ecode;
-	}
-
-	ecode = enqueue_in_free(tdb, b_off, off, len, &coalesce_ok);
-	check_list(tdb, b_off);
-
-	/* Coalescing unlocks free list. */
-	if (!ecode && coalesce_ok)
-		ecode = coalesce_list(tdb, tdb->ftable_off, b_off, 2);
-	else
-		tdb_unlock_free_bucket(tdb, b_off);
-	return ecode;
-}
-
-static size_t adjust_size(size_t keylen, size_t datalen)
-{
-	size_t size = keylen + datalen;
-
-	if (size < TDB_MIN_DATA_LEN)
-		size = TDB_MIN_DATA_LEN;
-
-	/* Round to next uint64_t boundary. */
-	return (size + (sizeof(uint64_t) - 1ULL)) & ~(sizeof(uint64_t) - 1ULL);
-}
-
-/* If we have enough left over to be useful, split that off. */
-static size_t record_leftover(size_t keylen, size_t datalen,
-			      bool want_extra, size_t total_len)
-{
-	ssize_t leftover;
-
-	if (want_extra)
-		datalen += datalen / 2;
-	leftover = total_len - adjust_size(keylen, datalen);
-
-	if (leftover < (ssize_t)sizeof(struct tdb_free_record))
-		return 0;
-
-	return leftover;
-}
-
-/* We need size bytes to put our key and data in. */
-static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
-				tdb_off_t ftable_off,
-				tdb_off_t bucket,
-				size_t keylen, size_t datalen,
-				bool want_extra,
-				unsigned magic,
-				unsigned hashlow)
-{
-	tdb_off_t off, b_off,best_off;
-	struct tdb_free_record best = { 0 };
-	double multiplier;
-	size_t size = adjust_size(keylen, datalen);
-	enum TDB_ERROR ecode;
-
-	tdb->stats.allocs++;
-	b_off = bucket_off(ftable_off, bucket);
-
-	/* FIXME: Try non-blocking wait first, to measure contention. */
-	/* Lock this bucket. */
-	ecode = tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT);
-	if (ecode != TDB_SUCCESS) {
-		return TDB_ERR_TO_OFF(ecode);
-	}
-
-	best.ftable_and_len = -1ULL;
-	best_off = 0;
-
-	/* Get slack if we're after extra. */
-	if (want_extra)
-		multiplier = 1.5;
-	else
-		multiplier = 1.0;
-
-	/* Walk the list to see if any are large enough, getting less fussy
-	 * as we go. */
-	off = tdb_read_off(tdb, b_off);
-	if (TDB_OFF_IS_ERR(off)) {
-		ecode = TDB_OFF_TO_ERR(off);
-		goto unlock_err;
-	}
-	off &= TDB_OFF_MASK;
-
-	while (off) {
-		const struct tdb_free_record *r;
-		tdb_len_t len;
-		tdb_off_t next;
-
-		r = tdb_access_read(tdb, off, sizeof(*r), true);
-		if (TDB_PTR_IS_ERR(r)) {
-			ecode = TDB_PTR_ERR(r);
-			goto unlock_err;
-		}
-
-		if (frec_magic(r) != TDB_FREE_MAGIC) {
-			ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-					   "lock_and_alloc:"
-					   " %llu non-free 0x%llx",
-					   (long long)off,
-					   (long long)r->magic_and_prev);
-			tdb_access_release(tdb, r);
-			goto unlock_err;
-		}
-
-		if (frec_len(r) >= size && frec_len(r) < frec_len(&best)) {
-			best_off = off;
-			best = *r;
-		}
-
-		if (frec_len(&best) <= size * multiplier && best_off) {
-			tdb_access_release(tdb, r);
-			break;
-		}
-
-		multiplier *= 1.01;
-
-		next = r->next;
-		len = frec_len(r);
-		tdb_access_release(tdb, r);
-		off = next;
-	}
-
-	/* If we found anything at all, use it. */
-	if (best_off) {
-		struct tdb_used_record rec;
-		size_t leftover;
-
-		/* We're happy with this size: take it. */
-		ecode = remove_from_list(tdb, b_off, best_off, &best);
-		check_list(tdb, b_off);
-		if (ecode != TDB_SUCCESS) {
-			goto unlock_err;
-		}
-
-		leftover = record_leftover(keylen, datalen, want_extra,
-					   frec_len(&best));
-
-		assert(keylen + datalen + leftover <= frec_len(&best));
-		/* We need to mark non-free before we drop lock, otherwise
-		 * coalesce() could try to merge it! */
-		ecode = set_header(tdb, &rec, magic, keylen, datalen,
-				   frec_len(&best) - leftover, hashlow);
-		if (ecode != TDB_SUCCESS) {
-			goto unlock_err;
-		}
-
-		ecode = tdb_write_convert(tdb, best_off, &rec, sizeof(rec));
-		if (ecode != TDB_SUCCESS) {
-			goto unlock_err;
-		}
-
-		/* For futureproofing, we put a 0 in any unused space. */
-		if (rec_extra_padding(&rec)) {
-			ecode = tdb->io->twrite(tdb, best_off + sizeof(rec)
-						+ keylen + datalen, "", 1);
-			if (ecode != TDB_SUCCESS) {
-				goto unlock_err;
-			}
-		}
-
-		/* Bucket of leftover will be <= current bucket, so nested
-		 * locking is allowed. */
-		if (leftover) {
-			tdb->stats.alloc_leftover++;
-			ecode = add_free_record(tdb,
-						best_off + sizeof(rec)
-						+ frec_len(&best) - leftover,
-						leftover, TDB_LOCK_WAIT, false);
-			if (ecode != TDB_SUCCESS) {
-				best_off = TDB_ERR_TO_OFF(ecode);
-			}
-		}
-		tdb_unlock_free_bucket(tdb, b_off);
-
-		return best_off;
-	}
-
-	tdb_unlock_free_bucket(tdb, b_off);
-	return 0;
-
-unlock_err:
-	tdb_unlock_free_bucket(tdb, b_off);
-	return TDB_ERR_TO_OFF(ecode);
-}
-
-/* Get a free block from current free list, or 0 if none, -ve on error. */
-static tdb_off_t get_free(struct tdb_context *tdb,
-			  size_t keylen, size_t datalen, bool want_extra,
-			  unsigned magic, unsigned hashlow)
-{
-	tdb_off_t off, ftable_off;
-	tdb_off_t start_b, b, ftable;
-	bool wrapped = false;
-
-	/* If they are growing, add 50% to get to higher bucket. */
-	if (want_extra)
-		start_b = size_to_bucket(adjust_size(keylen,
-						     datalen + datalen / 2));
-	else
-		start_b = size_to_bucket(adjust_size(keylen, datalen));
-
-	ftable_off = tdb->ftable_off;
-	ftable = tdb->ftable;
-	while (!wrapped || ftable_off != tdb->ftable_off) {
-		/* Start at exact size bucket, and search up... */
-		for (b = find_free_head(tdb, ftable_off, start_b);
-		     b < TDB_FREE_BUCKETS;
-		     b = find_free_head(tdb, ftable_off, b + 1)) {
-			/* Try getting one from list. */
-			off = lock_and_alloc(tdb, ftable_off,
-					     b, keylen, datalen, want_extra,
-					     magic, hashlow);
-			if (TDB_OFF_IS_ERR(off))
-				return off;
-			if (off != 0) {
-				if (b == start_b)
-					tdb->stats.alloc_bucket_exact++;
-				if (b == TDB_FREE_BUCKETS - 1)
-					tdb->stats.alloc_bucket_max++;
-				/* Worked?  Stay using this list. */
-				tdb->ftable_off = ftable_off;
-				tdb->ftable = ftable;
-				return off;
-			}
-			/* Didn't work.  Try next bucket. */
-		}
-
-		if (TDB_OFF_IS_ERR(b)) {
-			return b;
-		}
-
-		/* Hmm, try next table. */
-		ftable_off = next_ftable(tdb, ftable_off);
-		if (TDB_OFF_IS_ERR(ftable_off)) {
-			return ftable_off;
-		}
-		ftable++;
-
-		if (ftable_off == 0) {
-			wrapped = true;
-			ftable_off = first_ftable(tdb);
-			if (TDB_OFF_IS_ERR(ftable_off)) {
-				return ftable_off;
-			}
-			ftable = 0;
-		}
-	}
-
-	return 0;
-}
-
-enum TDB_ERROR set_header(struct tdb_context *tdb,
-			  struct tdb_used_record *rec,
-			  unsigned magic, uint64_t keylen, uint64_t datalen,
-			  uint64_t actuallen, unsigned hashlow)
-{
-	uint64_t keybits = (fls64(keylen) + 1) / 2;
-
-	/* Use bottom bits of hash, so it's independent of hash table size. */
-	rec->magic_and_meta = (hashlow & ((1 << 11)-1))
-		| ((actuallen - (keylen + datalen)) << 11)
-		| (keybits << 43)
-		| ((uint64_t)magic << 48);
-	rec->key_and_data_len = (keylen | (datalen << (keybits*2)));
-
-	/* Encoding can fail on big values. */
-	if (rec_key_length(rec) != keylen
-	    || rec_data_length(rec) != datalen
-	    || rec_extra_padding(rec) != actuallen - (keylen + datalen)) {
-		return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
-				  "Could not encode k=%llu,d=%llu,a=%llu",
-				  (long long)keylen, (long long)datalen,
-				  (long long)actuallen);
-	}
-	return TDB_SUCCESS;
-}
-
-/* You need 'size', this tells you how much you should expand by. */
-tdb_off_t tdb_expand_adjust(tdb_off_t map_size, tdb_off_t size)
-{
-	tdb_off_t new_size, top_size;
-
-	/* limit size in order to avoid using up huge amounts of memory for
-	 * in memory tdbs if an oddball huge record creeps in */
-	if (size > 100 * 1024) {
-		top_size = map_size + size * 2;
-	} else {
-		top_size = map_size + size * 100;
-	}
-
-	/* always make room for at least top_size more records, and at
-	   least 25% more space. if the DB is smaller than 100MiB,
-	   otherwise grow it by 10% only. */
-	if (map_size > 100 * 1024 * 1024) {
-		new_size = map_size * 1.10;
-	} else {
-		new_size = map_size * 1.25;
-	}
-
-	/* Round the database up to a multiple of the page size */
-	if (new_size < top_size)
-		new_size = top_size;
-	return new_size - map_size;
-}
-
-/* Expand the database. */
-static enum TDB_ERROR tdb_expand(struct tdb_context *tdb, tdb_len_t size)
-{
-	uint64_t old_size;
-	tdb_len_t wanted;
-	enum TDB_ERROR ecode;
-
-	/* Need to hold a hash lock to expand DB: transactions rely on it. */
-	if (!(tdb->flags & TDB_NOLOCK)
-	    && !tdb->file->allrecord_lock.count && !tdb_has_hash_locks(tdb)) {
-		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
-				  "tdb_expand: must hold lock during expand");
-	}
-
-	/* Only one person can expand file at a time. */
-	ecode = tdb_lock_expand(tdb, F_WRLCK);
-	if (ecode != TDB_SUCCESS) {
-		return ecode;
-	}
-
-	/* Someone else may have expanded the file, so retry. */
-	old_size = tdb->file->map_size;
-	tdb->io->oob(tdb, tdb->file->map_size, 1, true);
-	if (tdb->file->map_size != old_size) {
-		tdb_unlock_expand(tdb, F_WRLCK);
-		return TDB_SUCCESS;
-	}
-
-	/* Overallocate. */
-	wanted = tdb_expand_adjust(old_size, size);
-	/* We need room for the record header too. */
-	wanted = adjust_size(0, sizeof(struct tdb_used_record) + wanted);
-
-	ecode = tdb->io->expand_file(tdb, wanted);
-	if (ecode != TDB_SUCCESS) {
-		tdb_unlock_expand(tdb, F_WRLCK);
-		return ecode;
-	}
-
-	/* We need to drop this lock before adding free record. */
-	tdb_unlock_expand(tdb, F_WRLCK);
-
-	tdb->stats.expands++;
-	return add_free_record(tdb, old_size, wanted, TDB_LOCK_WAIT, true);
-}
-
-/* This won't fail: it will expand the database if it has to. */
-tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
-		uint64_t hash, unsigned magic, bool growing)
-{
-	tdb_off_t off;
-
-	/* We can't hold pointers during this: we could unmap! */
-	assert(!tdb->direct_access);
-
-	for (;;) {
-		enum TDB_ERROR ecode;
-		off = get_free(tdb, keylen, datalen, growing, magic, hash);
-		if (likely(off != 0))
-			break;
-
-		ecode = tdb_expand(tdb, adjust_size(keylen, datalen));
-		if (ecode != TDB_SUCCESS) {
-			return TDB_ERR_TO_OFF(ecode);
-		}
-	}
-
-	return off;
-}
diff --git a/lib/tdb2/hash.c b/lib/tdb2/hash.c
deleted file mode 100644
index 067884a74e..0000000000
--- a/lib/tdb2/hash.c
+++ /dev/null
@@ -1,894 +0,0 @@
- /*
-   Trivial Database 2: hash handling
-   Copyright (C) Rusty Russell 2010
-
-   This library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 3 of the License, or (at your option) any later version.
-
-   This library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
-#include "private.h"
-#include <ccan/hash/hash.h>
-#include <assert.h>
-
-/* Default hash function. */
-uint64_t tdb_jenkins_hash(const void *key, size_t length, uint64_t seed,
-			  void *unused)
-{
-	uint64_t ret;
-	/* hash64_stable assumes lower bits are more important; they are a
-	 * slightly better hash.  We use the upper bits first, so swap them. */
-	ret = hash64_stable((const unsigned char *)key, length, seed);
-	return (ret >> 32) | (ret << 32);
-}
-
-uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len)
-{
-	return tdb->hash_fn(ptr, len, tdb->hash_seed, tdb->hash_data);
-}
-
-uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off)
-{
-	const struct tdb_used_record *r;
-	const void *key;
-	uint64_t klen, hash;
-
-	r = tdb_access_read(tdb, off, sizeof(*r), true);
-	if (TDB_PTR_IS_ERR(r)) {
-		/* FIXME */
-		return 0;
-	}
-
-	klen = rec_key_length(r);
-	tdb_access_release(tdb, r);
-
-	key = tdb_access_read(tdb, off + sizeof(*r), klen, false);
-	if (TDB_PTR_IS_ERR(key)) {
-		return 0;
-	}
-
-	hash = tdb_hash(tdb, key, klen);
-	tdb_access_release(tdb, key);
-	return hash;
-}
-
-/* Get bits from a value. */
-static uint32_t bits_from(uint64_t val, unsigned start, unsigned num)
-{
-	assert(num <= 32);
-	return (val >> start) & ((1U << num) - 1);
-}
-
-/* We take bits from the top: that way we can lock whole sections of the hash
- * by using lock ranges. */
-static uint32_t use_bits(struct hash_info *h, unsigned num)
-{
-	h->hash_used += num;
-	return bits_from(h->h, 64 - h->hash_used, num);
-}
-
-static tdb_bool_err key_matches(struct tdb_context *tdb,
-				const struct tdb_used_record *rec,
-				tdb_off_t off,
-				const struct tdb_data *key)
-{
-	tdb_bool_err ret = false;
-	const char *rkey;
-
-	if (rec_key_length(rec) != key->dsize) {
-		tdb->stats.compare_wrong_keylen++;
-		return ret;
-	}
-
-	rkey = tdb_access_read(tdb, off + sizeof(*rec), key->dsize, false);
-	if (TDB_PTR_IS_ERR(rkey)) {
-		return (tdb_bool_err)TDB_PTR_ERR(rkey);
-	}
-	if (memcmp(rkey, key->dptr, key->dsize) == 0)
-		ret = true;
-	else
-		tdb->stats.compare_wrong_keycmp++;
-	tdb_access_release(tdb, rkey);
-	return ret;
-}
-
-/* Does entry match? */
-static tdb_bool_err match(struct tdb_context *tdb,
-			  struct hash_info *h,
-			  const struct tdb_data *key,
-			  tdb_off_t val,
-			  struct tdb_used_record *rec)
-{
-	tdb_off_t off;
-	enum TDB_ERROR ecode;
-
-	tdb->stats.compares++;
-	/* Desired bucket must match. */
-	if (h->home_bucket != (val & TDB_OFF_HASH_GROUP_MASK)) {
-		tdb->stats.compare_wrong_bucket++;
-		return false;
-	}
-
-	/* Top bits of offset == next bits of hash. */
-	if (bits_from(val, TDB_OFF_HASH_EXTRA_BIT, TDB_OFF_UPPER_STEAL_EXTRA)
-	    != bits_from(h->h, 64 - h->hash_used - TDB_OFF_UPPER_STEAL_EXTRA,
-		    TDB_OFF_UPPER_STEAL_EXTRA)) {
-		tdb->stats.compare_wrong_offsetbits++;
-		return false;
-	}
-
-	off = val & TDB_OFF_MASK;
-	ecode = tdb_read_convert(tdb, off, rec, sizeof(*rec));
-	if (ecode != TDB_SUCCESS) {
-		return (tdb_bool_err)ecode;
-	}
-
-	if ((h->h & ((1 << 11)-1)) != rec_hash(rec)) {
-		tdb->stats.compare_wrong_rechash++;
-		return false;
-	}
-
-	return key_matches(tdb, rec, off, key);
-}
-
-static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned bucket)
-{
-	return group_start
-		+ (bucket % (1 << TDB_HASH_GROUP_BITS)) * sizeof(tdb_off_t);
-}
-
-bool is_subhash(tdb_off_t val)
-{
-	return (val >> TDB_OFF_UPPER_STEAL_SUBHASH_BIT) & 1;
-}
-
-/* FIXME: Guess the depth, don't over-lock! */
-static tdb_off_t hlock_range(tdb_off_t group, tdb_off_t *size)
-{
-	*size = 1ULL << (64 - (TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS));
-	return group << (64 - (TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS));
-}
-
-static tdb_off_t COLD find_in_chain(struct tdb_context *tdb,
-				    struct tdb_data key,
-				    tdb_off_t chain,
-				    struct hash_info *h,
-				    struct tdb_used_record *rec,
-				    struct traverse_info *tinfo)
-{
-	tdb_off_t off, next;
-	enum TDB_ERROR ecode;
-
-	/* In case nothing is free, we set these to zero. */
-	h->home_bucket = h->found_bucket = 0;
-
-	for (off = chain; off; off = next) {
-		unsigned int i;
-
-		h->group_start = off;
-		ecode = tdb_read_convert(tdb, off, h->group, sizeof(h->group));
-		if (ecode != TDB_SUCCESS) {
-			return TDB_ERR_TO_OFF(ecode);
-		}
-
-		for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) {
-			tdb_off_t recoff;
-			if (!h->group[i]) {
-				/* Remember this empty bucket. */
-				h->home_bucket = h->found_bucket = i;
-				continue;
-			}
-
-			/* We can insert extra bits via add_to_hash
-			 * empty bucket logic. */
-			recoff = h->group[i] & TDB_OFF_MASK;
-			ecode = tdb_read_convert(tdb, recoff, rec,
-						 sizeof(*rec));
-			if (ecode != TDB_SUCCESS) {
-				return TDB_ERR_TO_OFF(ecode);
-			}
-
-			ecode = TDB_OFF_TO_ERR(key_matches(tdb, rec, recoff,
-							   &key));
-			if (ecode < 0) {
-				return TDB_ERR_TO_OFF(ecode);
-			}
-			if (ecode == (enum TDB_ERROR)1) {
-				h->home_bucket = h->found_bucket = i;
-
-				if (tinfo) {
-					tinfo->levels[tinfo->num_levels]
-						.hashtable = off;
-					tinfo->levels[tinfo->num_levels]
-						.total_buckets
-						= 1 << TDB_HASH_GROUP_BITS;
-					tinfo->levels[tinfo->num_levels].entry
-						= i;
-					tinfo->num_levels++;
-				}
-				return recoff;
-			}
-		}
-		next = tdb_read_off(tdb, off
-				    + offsetof(struct tdb_chain, next));
-		if (TDB_OFF_IS_ERR(next)) {
-			return next;
-		}
-		if (next)
-			next += sizeof(struct tdb_used_record);
-	}
-	return 0;
-}
-
-/* This is the core routine which searches the hashtable for an entry.
- * On error, no locks are held and -ve is returned.
- * Otherwise, hinfo is filled in (and the optional tinfo).
- * If not found, the return value is 0.
- * If found, the return value is the offset, and *rec is the record. */
-tdb_off_t find_and_lock(struct tdb_context *tdb,
-			struct tdb_data key,
-			int ltype,
-			struct hash_info *h,
-			struct tdb_used_record *rec,
-			struct traverse_info *tinfo)
-{
-	uint32_t i, group;
-	tdb_off_t hashtable;
-	enum TDB_ERROR ecode;
-
-	h->h = tdb_hash(tdb, key.dptr, key.dsize);
-	h->hash_used = 0;
-	group = use_bits(h, TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS);
-	h->home_bucket = use_bits(h, TDB_HASH_GROUP_BITS);
-
-	h->hlock_start = hlock_range(group, &h->hlock_range);
-	ecode = tdb_lock_hashes(tdb, h->hlock_start, h->hlock_range, ltype,
-				TDB_LOCK_WAIT);
-	if (ecode != TDB_SUCCESS) {
-		return TDB_ERR_TO_OFF(ecode);
-	}
-
-	hashtable = offsetof(struct tdb_header, hashtable);
-	if (tinfo) {
-		tinfo->toplevel_group = group;
-		tinfo->num_levels = 1;
-		tinfo->levels[0].entry = 0;
-		tinfo->levels[0].hashtable = hashtable
-			+ (group << TDB_HASH_GROUP_BITS) * sizeof(tdb_off_t);
-		tinfo->levels[0].total_buckets = 1 << TDB_HASH_GROUP_BITS;
-	}
-
-	while (h->hash_used <= 64) {
-		/* Read in the hash group. */
-		h->group_start = hashtable
-			+ group * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
-
-		ecode = tdb_read_convert(tdb, h->group_start, &h->group,
-					 sizeof(h->group));
-		if (ecode != TDB_SUCCESS) {
-			goto fail;
-		}
-
-		/* Pointer to another hash table?  Go down... */
-		if (is_subhash(h->group[h->home_bucket])) {
-			hashtable = (h->group[h->home_bucket] & TDB_OFF_MASK)
-				+ sizeof(struct tdb_used_record);
-			if (tinfo) {
-				/* When we come back, use *next* bucket */
-				tinfo->levels[tinfo->num_levels-1].entry
-					+= h->home_bucket + 1;
-			}
-			group = use_bits(h, TDB_SUBLEVEL_HASH_BITS
-					 - TDB_HASH_GROUP_BITS);
-			h->home_bucket = use_bits(h, TDB_HASH_GROUP_BITS);
-			if (tinfo) {
-				tinfo->levels[tinfo->num_levels].hashtable
-					= hashtable;
-				tinfo->levels[tinfo->num_levels].total_buckets
-					= 1 << TDB_SUBLEVEL_HASH_BITS;
-				tinfo->levels[tinfo->num_levels].entry
-					= group << TDB_HASH_GROUP_BITS;
-				tinfo->num_levels++;
-			}
-			continue;
-		}
-
-		/* It's in this group: search (until 0 or all searched) */
-		for (i = 0, h->found_bucket = h->home_bucket;
-		     i < (1 << TDB_HASH_GROUP_BITS);
-		     i++, h->found_bucket = ((h->found_bucket+1)
-					     % (1 << TDB_HASH_GROUP_BITS))) {
-			tdb_bool_err berr;
-			if (is_subhash(h->group[h->found_bucket]))
-				continue;
-
-			if (!h->group[h->found_bucket])
-				break;
-
-			berr = match(tdb, h, &key, h->group[h->found_bucket],
-				     rec);
-			if (berr < 0) {
-				ecode = TDB_OFF_TO_ERR(berr);
-				goto fail;
-			}
-			if (berr) {
-				if (tinfo) {
-					tinfo->levels[tinfo->num_levels-1].entry
-						+= h->found_bucket;
-				}
-				return h->group[h->found_bucket] & TDB_OFF_MASK;
-			}
-		}
-		/* Didn't find it: h indicates where it would go. */
-		return 0;
-	}
-
-	return find_in_chain(tdb, key, hashtable, h, rec, tinfo);
-
-fail:
-	tdb_unlock_hashes(tdb, h->hlock_start, h->hlock_range, ltype);
-	return TDB_ERR_TO_OFF(ecode);
-}
-
-/* I wrote a simple test, expanding a hash to 2GB, for the following
- * cases:
- * 1) Expanding all the buckets at once,
- * 2) Expanding the bucket we wanted to place the new entry into.
- * 3) Expanding the most-populated bucket,
- *
- * I measured the worst/average/best density during this process.
- * 1) 3%/16%/30%
- * 2) 4%/20%/38%
- * 3) 6%/22%/41%
- *
- * So we figure out the busiest bucket for the moment.
- */
-static unsigned fullest_bucket(struct tdb_context *tdb,
-			       const tdb_off_t *group,
-			       unsigned new_bucket)
-{
-	unsigned counts[1 << TDB_HASH_GROUP_BITS] = { 0 };
-	unsigned int i, best_bucket;
-
-	/* Count the new entry. */
-	counts[new_bucket]++;
-	best_bucket = new_bucket;
-
-	for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) {
-		unsigned this_bucket;
-
-		if (is_subhash(group[i]))
-			continue;
-		this_bucket = group[i] & TDB_OFF_HASH_GROUP_MASK;
-		if (++counts[this_bucket] > counts[best_bucket])
-			best_bucket = this_bucket;
-	}
-
-	return best_bucket;
-}
-
-static bool put_into_group(tdb_off_t *group,
-			   unsigned bucket, tdb_off_t encoded)
-{
-	unsigned int i;
-
-	for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) {
-		unsigned b = (bucket + i) % (1 << TDB_HASH_GROUP_BITS);
-
-		if (group[b] == 0) {
-			group[b] = encoded;
-			return true;
-		}
-	}
-	return false;
-}
-
-static void force_into_group(tdb_off_t *group,
-			     unsigned bucket, tdb_off_t encoded)
-{
-	if (!put_into_group(group, bucket, encoded))
-		abort();
-}
-
-static tdb_off_t encode_offset(tdb_off_t new_off, struct hash_info *h)
-{
-	return h->home_bucket
-		| new_off
-		| ((uint64_t)bits_from(h->h,
-				  64 - h->hash_used - TDB_OFF_UPPER_STEAL_EXTRA,
-				  TDB_OFF_UPPER_STEAL_EXTRA)
-		   << TDB_OFF_HASH_EXTRA_BIT);
-}
-
-/* Simply overwrite the hash entry we found before. */
-enum TDB_ERROR replace_in_hash(struct tdb_context *tdb,
-			       struct hash_info *h,
-			       tdb_off_t new_off)
-{
-	return tdb_write_off(tdb, hbucket_off(h->group_start, h->found_bucket),
-			     encode_offset(new_off, h));
-}
-
-/* We slot in anywhere that's empty in the chain. */
-static enum TDB_ERROR COLD add_to_chain(struct tdb_context *tdb,
-					tdb_off_t subhash,
-					tdb_off_t new_off)
-{
-	tdb_off_t entry;
-	enum TDB_ERROR ecode;
-
-	entry = tdb_find_zero_off(tdb, subhash, 1<<TDB_HASH_GROUP_BITS);
-	if (TDB_OFF_IS_ERR(entry)) {
-		return TDB_OFF_TO_ERR(entry);
-	}
-
-	if (entry == 1 << TDB_HASH_GROUP_BITS) {
-		tdb_off_t next;
-
-		next = tdb_read_off(tdb, subhash
-				    + offsetof(struct tdb_chain, next));
-		if (TDB_OFF_IS_ERR(next)) {
-			return TDB_OFF_TO_ERR(next);
-		}
-
-		if (!next) {
-			next = alloc(tdb, 0, sizeof(struct tdb_chain), 0,
-				     TDB_CHAIN_MAGIC, false);
-			if (TDB_OFF_IS_ERR(next))
-				return TDB_OFF_TO_ERR(next);
-			ecode = zero_out(tdb,
-					 next+sizeof(struct tdb_used_record),
-					 sizeof(struct tdb_chain));
-			if (ecode != TDB_SUCCESS) {
-				return ecode;
-			}
-			ecode = tdb_write_off(tdb, subhash
-					      + offsetof(struct tdb_chain,
-							 next),
-					      next);
-			if (ecode != TDB_SUCCESS) {
-				return ecode;
-			}
-		}
-		return add_to_chain(tdb, next, new_off);
-	}
-
-	return tdb_write_off(tdb, subhash + entry * sizeof(tdb_off_t),
-			     new_off);
-}
-
-/* Add into a newly created subhash. */
-static enum TDB_ERROR add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash,
-				     unsigned hash_used, tdb_off_t val)
-{
-	tdb_off_t off = (val & TDB_OFF_MASK), *group;
-	struct hash_info h;
-	unsigned int gnum;
-
-	h.hash_used = hash_used;
-
-	if (hash_used + TDB_SUBLEVEL_HASH_BITS > 64)
-		return add_to_chain(tdb, subhash, off);
-
-	h.h = hash_record(tdb, off);
-	gnum = use_bits(&h, TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS);
-	h.group_start = subhash
-		+ gnum * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
-	h.home_bucket = use_bits(&h, TDB_HASH_GROUP_BITS);
-
-	group = tdb_access_write(tdb, h.group_start,
-				 sizeof(*group) << TDB_HASH_GROUP_BITS, true);
-	if (TDB_PTR_IS_ERR(group)) {
-		return TDB_PTR_ERR(group);
-	}
-	force_into_group(group, h.home_bucket, encode_offset(off, &h));
-	return tdb_access_commit(tdb, group);
-}
-
-static enum TDB_ERROR expand_group(struct tdb_context *tdb, struct hash_info *h)
-{
-	unsigned bucket, num_vals, i, magic;
-	size_t subsize;
-	tdb_off_t subhash;
-	tdb_off_t vals[1 << TDB_HASH_GROUP_BITS];
-	enum TDB_ERROR ecode;
-
-	/* Attach new empty subhash under fullest bucket. */
-	bucket = fullest_bucket(tdb, h->group, h->home_bucket);
-
-	if (h->hash_used == 64) {
-		tdb->stats.alloc_chain++;
-		subsize = sizeof(struct tdb_chain);
-		magic = TDB_CHAIN_MAGIC;
-	} else {
-		tdb->stats.alloc_subhash++;
-		subsize = (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS);
-		magic = TDB_HTABLE_MAGIC;
-	}
-
-	subhash = alloc(tdb, 0, subsize, 0, magic, false);
-	if (TDB_OFF_IS_ERR(subhash)) {
-		return TDB_OFF_TO_ERR(subhash);
-	}
-
-	ecode = zero_out(tdb, subhash + sizeof(struct tdb_used_record),
-			 subsize);
-	if (ecode != TDB_SUCCESS) {
-		return ecode;
-	}
-
-	/* Remove any which are destined for bucket or are in wrong place. */
-	num_vals = 0;
-	for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) {
-		unsigned home_bucket = h->group[i] & TDB_OFF_HASH_GROUP_MASK;
-		if (!h->group[i] || is_subhash(h->group[i]))
-			continue;
-		if (home_bucket == bucket || home_bucket != i) {
-			vals[num_vals++] = h->group[i];
-			h->group[i] = 0;
-		}
-	}
-	/* FIXME: This assert is valid, but we do this during unit test :( */
-	/* assert(num_vals); */
-
-	/* Overwrite expanded bucket with subhash pointer. */
-	h->group[bucket] = subhash | (1ULL << TDB_OFF_UPPER_STEAL_SUBHASH_BIT);
-
-	/* Point to actual contents of record. */
-	subhash += sizeof(struct tdb_used_record);
-
-	/* Put values back. */
-	for (i = 0; i < num_vals; i++) {
-		unsigned this_bucket = vals[i] & TDB_OFF_HASH_GROUP_MASK;
-
-		if (this_bucket == bucket) {
-			ecode = add_to_subhash(tdb, subhash, h->hash_used,
-					       vals[i]);
-			if (ecode != TDB_SUCCESS)
-				return ecode;
-		} else {
-			/* There should be room to put this back. */
-			force_into_group(h->group, this_bucket, vals[i]);
-		}
-	}
-	return TDB_SUCCESS;
-}
-
-enum TDB_ERROR delete_from_hash(struct tdb_context *tdb, struct hash_info *h)
-{
-	unsigned int i, num_movers = 0;
-	tdb_off_t movers[1 << TDB_HASH_GROUP_BITS];
-
-	h->group[h->found_bucket] = 0;
-	for (i = 1; i < (1 << TDB_HASH_GROUP_BITS); i++) {
-		unsigned this_bucket;
-
-		this_bucket = (h->found_bucket+i) % (1 << TDB_HASH_GROUP_BITS);
-		/* Empty bucket?  We're done. */
-		if (!h->group[this_bucket])
-			break;
-
-		/* Ignore subhashes. */
-		if (is_subhash(h->group[this_bucket]))
-			continue;
-
-		/* If this one is not happy where it is, we'll move it. */
-		if ((h->group[this_bucket] & TDB_OFF_HASH_GROUP_MASK)
-		    != this_bucket) {
-			movers[num_movers++] = h->group[this_bucket];
-			h->group[this_bucket] = 0;
-		}
-	}
-
-	/* Put back the ones we erased. */
-	for (i = 0; i < num_movers; i++) {
-		force_into_group(h->group, movers[i] & TDB_OFF_HASH_GROUP_MASK,
-				 movers[i]);
-	}
-
-	/* Now we write back the hash group */
-	return tdb_write_convert(tdb, h->group_start,
-				 h->group, sizeof(h->group));
-}
-
-enum TDB_ERROR add_to_hash(struct tdb_context *tdb, struct hash_info *h,
-			   tdb_off_t new_off)
-{
-	enum TDB_ERROR ecode;
-
-	/* We hit an empty bucket during search?  That's where it goes. */
-	if (!h->group[h->found_bucket]) {
-		h->group[h->found_bucket] = encode_offset(new_off, h);
-		/* Write back the modified group. */
-		return tdb_write_convert(tdb, h->group_start,
-					 h->group, sizeof(h->group));
-	}
-
-	if (h->hash_used > 64)
-		return add_to_chain(tdb, h->group_start, new_off);
-
-	/* We're full.  Expand. */
-	ecode = expand_group(tdb, h);
-	if (ecode != TDB_SUCCESS) {
-		return ecode;
-	}
-
-	if (is_subhash(h->group[h->home_bucket])) {
-		/* We were expanded! */
-		tdb_off_t hashtable;
-		unsigned int gnum;
-
-		/* Write back the modified group. */
-		ecode = tdb_write_convert(tdb, h->group_start, h->group,
-					  sizeof(h->group));
-		if (ecode != TDB_SUCCESS) {
-			return ecode;
-		}
-
-		/* Move hashinfo down a level. */
-		hashtable = (h->group[h->home_bucket] & TDB_OFF_MASK)
-			+ sizeof(struct tdb_used_record);
-		gnum = use_bits(h,TDB_SUBLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS);
-		h->home_bucket = use_bits(h, TDB_HASH_GROUP_BITS);
-		h->group_start = hashtable
-			+ gnum * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
-		ecode = tdb_read_convert(tdb, h->group_start, &h->group,
-					 sizeof(h->group));
-		if (ecode != TDB_SUCCESS) {
-			return ecode;
-		}
-	}
-
-	/* Expanding the group must have made room if it didn't choose this
-	 * bucket. */
-	if (put_into_group(h->group, h->home_bucket, encode_offset(new_off,h))){
-		return tdb_write_convert(tdb, h->group_start,
-					 h->group, sizeof(h->group));
-	}
-
-	/* This can happen if all hashes in group (and us) dropped into same
-	 * group in subhash. */
-	return add_to_hash(tdb, h, new_off);
-}
-
-/* Traverse support: returns offset of record, or 0 or -ve error. */
-static tdb_off_t iterate_hash(struct tdb_context *tdb,
-			      struct traverse_info *tinfo)
-{
-	tdb_off_t off, val, i;
-	struct traverse_level *tlevel;
-
-	tlevel = &tinfo->levels[tinfo->num_levels-1];
-
-again:
-	for (i = tdb_find_nonzero_off(tdb, tlevel->hashtable,
-				      tlevel->entry, tlevel->total_buckets);
-	     i != tlevel->total_buckets;
-	     i = tdb_find_nonzero_off(tdb, tlevel->hashtable,
-				      i+1, tlevel->total_buckets)) {
-		if (TDB_OFF_IS_ERR(i)) {
-			return i;
-		}
-
-		val = tdb_read_off(tdb, tlevel->hashtable+sizeof(tdb_off_t)*i);
-		if (TDB_OFF_IS_ERR(val)) {
-			return val;
-		}
-
-		off = val & TDB_OFF_MASK;
-
-		/* This makes the delete-all-in-traverse case work
-		 * (and simplifies our logic a little). */
-		if (off == tinfo->prev)
-			continue;
-
-		tlevel->entry = i;
-
-		if (!is_subhash(val)) {
-			/* Found one. */
-			tinfo->prev = off;
-			return off;
-		}
-
-		/* When we come back, we want the next one */
-		tlevel->entry++;
-		tinfo->num_levels++;
-		tlevel++;
-		tlevel->hashtable = off + sizeof(struct tdb_used_record);
-		tlevel->entry = 0;
-		/* Next level is a chain? */
-		if (unlikely(tinfo->num_levels == TDB_MAX_LEVELS + 1))
-			tlevel->total_buckets = (1 << TDB_HASH_GROUP_BITS);
-		else
-			tlevel->total_buckets = (1 << TDB_SUBLEVEL_HASH_BITS);
-		goto again;
-	}
-
-	/* Nothing there? */
-	if (tinfo->num_levels == 1)
-		return 0;
-
-	/* Handle chained entries. */
-	if (unlikely(tinfo->num_levels == TDB_MAX_LEVELS + 1)) {
-		tlevel->hashtable = tdb_read_off(tdb, tlevel->hashtable
-						 + offsetof(struct tdb_chain,
-							    next));
-		if (TDB_OFF_IS_ERR(tlevel->hashtable)) {
-			return tlevel->hashtable;
-		}
-		if (tlevel->hashtable) {
-			tlevel->hashtable += sizeof(struct tdb_used_record);
-			tlevel->entry = 0;
-			goto again;
-		}
-	}
-
-	/* Go back up and keep searching. */
-	tinfo->num_levels--;
-	tlevel--;
-	goto again;
-}
-
-/* Return success if we find something, TDB_ERR_NOEXIST if none. */
-enum TDB_ERROR next_in_hash(struct tdb_context *tdb,
-			    struct traverse_info *tinfo,
-			    TDB_DATA *kbuf, size_t *dlen)
-{
-	const unsigned group_bits = TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS;
-	tdb_off_t hl_start, hl_range, off;
-	enum TDB_ERROR ecode;
-
-	while (tinfo->toplevel_group < (1 << group_bits)) {
-		hl_start = (tdb_off_t)tinfo->toplevel_group
-			<< (64 - group_bits);
-		hl_range = 1ULL << group_bits;
-		ecode = tdb_lock_hashes(tdb, hl_start, hl_range, F_RDLCK,
-					TDB_LOCK_WAIT);
-		if (ecode != TDB_SUCCESS) {
-			return ecode;
-		}
-
-		off = iterate_hash(tdb, tinfo);
-		if (off) {
-			struct tdb_used_record rec;
-
-			if (TDB_OFF_IS_ERR(off)) {
-				ecode = TDB_OFF_TO_ERR(off);
-				goto fail;
-			}
-
-			ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
-			if (ecode != TDB_SUCCESS) {
-				goto fail;
-			}
-			if (rec_magic(&rec) != TDB_USED_MAGIC) {
-				ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
-						   TDB_LOG_ERROR,
-						   "next_in_hash:"
-						   " corrupt record at %llu",
-						   (long long)off);
-				goto fail;
-			}
-
-			kbuf->dsize = rec_key_length(&rec);
-
-			/* They want data as well? */
-			if (dlen) {
-				*dlen = rec_data_length(&rec);
-				kbuf->dptr = tdb_alloc_read(tdb,
-							    off + sizeof(rec),
-							    kbuf->dsize
-							    + *dlen);
-			} else {
-				kbuf->dptr = tdb_alloc_read(tdb,
-							    off + sizeof(rec),
-							    kbuf->dsize);
-			}
-			tdb_unlock_hashes(tdb, hl_start, hl_range, F_RDLCK);
-			if (TDB_PTR_IS_ERR(kbuf->dptr)) {
-				return TDB_PTR_ERR(kbuf->dptr);
-			}
-			return TDB_SUCCESS;
-		}
-
-		tdb_unlock_hashes(tdb, hl_start, hl_range, F_RDLCK);
-
-		tinfo->toplevel_group++;
-		tinfo->levels[0].hashtable
-			+= (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
-		tinfo->levels[0].entry = 0;
-	}
-	return TDB_ERR_NOEXIST;
-
-fail:
-	tdb_unlock_hashes(tdb, hl_start, hl_range, F_RDLCK);
-	return ecode;
-
-}
-
-enum TDB_ERROR first_in_hash(struct tdb_context *tdb,
-			     struct traverse_info *tinfo,
-			     TDB_DATA *kbuf, size_t *dlen)
-{
-	tinfo->prev = 0;
-	tinfo->toplevel_group = 0;
-	tinfo->num_levels = 1;
-	tinfo->levels[0].hashtable = offsetof(struct tdb_header, hashtable);
-	tinfo->levels[0].entry = 0;
-	tinfo->levels[0].total_buckets = (1 << TDB_HASH_GROUP_BITS);
-
-	return next_in_hash(tdb, tinfo, kbuf, dlen);
-}
-
-/* Even if the entry isn't in this hash bucket, you'd have to lock this
- * bucket to find it. */
-static enum TDB_ERROR chainlock(struct tdb_context *tdb, const TDB_DATA *key,
-				int ltype, enum tdb_lock_flags waitflag,
-				const char *func)
-{
-	enum TDB_ERROR ecode;
-	uint64_t h = tdb_hash(tdb, key->dptr, key->dsize);
-	tdb_off_t lockstart, locksize;
-	unsigned int group, gbits;
-
-	gbits = TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS;
-	group = bits_from(h, 64 - gbits, gbits);
-
-	lockstart = hlock_range(group, &locksize);
-
-	ecode = tdb_lock_hashes(tdb, lockstart, locksize, ltype, waitflag);
-	tdb_trace_1rec(tdb, func, *key);
-	return ecode;
-}
-
-/* lock/unlock one hash chain. This is meant to be used to reduce
-   contention - it cannot guarantee how many records will be locked */
-_PUBLIC_ enum TDB_ERROR tdb_chainlock(struct tdb_context *tdb, TDB_DATA key)
-{
-	return tdb->last_error = chainlock(tdb, &key, F_WRLCK, TDB_LOCK_WAIT,
-					   "tdb_chainlock");
-}
-
-_PUBLIC_ void tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key)
-{
-	uint64_t h = tdb_hash(tdb, key.dptr, key.dsize);
-	tdb_off_t lockstart, locksize;
-	unsigned int group, gbits;
-
-	gbits = TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS;
-	group = bits_from(h, 64 - gbits, gbits);
-
-	lockstart = hlock_range(group, &locksize);
-
-	tdb_trace_1rec(tdb, "tdb_chainunlock", key);
-	tdb_unlock_hashes(tdb, lockstart, locksize, F_WRLCK);
-}
-
-_PUBLIC_ enum TDB_ERROR tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key)
-{
-	return tdb->last_error = chainlock(tdb, &key, F_RDLCK, TDB_LOCK_WAIT,
-					   "tdb_chainlock_read");
-}
-
-_PUBLIC_ void tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key)
-{
-	uint64_t h = tdb_hash(tdb, key.dptr, key.dsize);
-	tdb_off_t lockstart, locksize;
-	unsigned int group, gbits;
-
-	gbits = TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS;
-	group = bits_from(h, 64 - gbits, gbits);
-
-	lockstart = hlock_range(group, &locksize);
-
-	tdb_trace_1rec(tdb, "tdb_chainunlock_read", key);
-	tdb_unlock_hashes(tdb, lockstart, locksize, F_RDLCK);
-}
diff --git a/lib/tdb2/io.c b/lib/tdb2/io.c
deleted file mode 100644
index ca044ae361..0000000000
--- a/lib/tdb2/io.c
+++ /dev/null
@@ -1,650 +0,0 @@
- /*
-   Unix SMB/CIFS implementation.
-
-   trivial database library
-
-   Copyright (C) Andrew Tridgell              1999-2005
-   Copyright (C) Paul `Rusty' Russell		   2000
-   Copyright (C) Jeremy Allison			   2000-2003
-   Copyright (C) Rusty Russell			   2010
-
-     ** NOTE! The following LGPL license applies to the tdb
-     ** library. This does NOT imply that all of Samba is released
-     ** under the LGPL
-
-   This library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 3 of the License, or (at your option) any later version.
-
-   This library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
-#include "private.h"
-#include <assert.h>
-#include <ccan/likely/likely.h>
-
-void tdb_munmap(struct tdb_file *file)
-{
-	if (file->fd == -1)
-		return;
-
-	if (file->map_ptr) {
-		munmap(file->map_ptr, file->map_size);
-		file->map_ptr = NULL;
-	}
-}
-
-enum TDB_ERROR tdb_mmap(struct tdb_context *tdb)
-{
-	int mmap_flags;
-
-	if (tdb->flags & TDB_INTERNAL)
-		return TDB_SUCCESS;
-
-#ifndef HAVE_INCOHERENT_MMAP
-	if (tdb->flags & TDB_NOMMAP)
-		return TDB_SUCCESS;
-#endif
-
-	if ((tdb->open_flags & O_ACCMODE) == O_RDONLY)
-		mmap_flags = PROT_READ;
-	else
-		mmap_flags = PROT_READ | PROT_WRITE;
-
-	/* size_t can be smaller than off_t. */
-	if ((size_t)tdb->file->map_size == tdb->file->map_size) {
-		tdb->file->map_ptr = mmap(NULL, tdb->file->map_size,
-					  mmap_flags,
-					  MAP_SHARED, tdb->file->fd, 0);
-	} else
-		tdb->file->map_ptr = MAP_FAILED;
-
-	/*
-	 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
-	 */
-	if (tdb->file->map_ptr == MAP_FAILED) {
-		tdb->file->map_ptr = NULL;
-#ifdef HAVE_INCOHERENT_MMAP
-		/* Incoherent mmap means everyone must mmap! */
-		return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
-				  "tdb_mmap failed for size %lld (%s)",
-				  (long long)tdb->file->map_size,
-				  strerror(errno));
-#else
-		tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
-			   "tdb_mmap failed for size %lld (%s)",
-			   (long long)tdb->file->map_size, strerror(errno));
-#endif
-	}
-	return TDB_SUCCESS;
-}
-
-/* check for an out of bounds access - if it is out of bounds then
-   see if the database has been expanded by someone else and expand
-   if necessary
-   note that "len" is the minimum length needed for the db.
-
-   If probe is true, len being too large isn't a failure.
-*/
-static enum TDB_ERROR tdb_oob(struct tdb_context *tdb,
-			      tdb_off_t off, tdb_len_t len, bool probe)
-{
-	struct stat st;
-	enum TDB_ERROR ecode;
-
-	/* We can't hold pointers during this: we could unmap! */
-	assert(!tdb->direct_access
-	       || (tdb->flags & TDB_NOLOCK)
-	       || tdb_has_expansion_lock(tdb));
-
-	if (len + off < len) {
-		if (probe)
-			return TDB_SUCCESS;
-
-		return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
-				  "tdb_oob off %llu len %llu wrap\n",
-				  (long long)off, (long long)len);
-	}
-
-	if (len + off <= tdb->file->map_size)
-		return TDB_SUCCESS;
-	if (tdb->flags & TDB_INTERNAL) {
-		if (probe)
-			return TDB_SUCCESS;
-
-		tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
-			   "tdb_oob len %lld beyond internal"
-			   " malloc size %lld",
-			   (long long)(off + len),
-			   (long long)tdb->file->map_size);
-		return TDB_ERR_IO;
-	}
-
-	ecode = tdb_lock_expand(tdb, F_RDLCK);
-	if (ecode != TDB_SUCCESS) {
-		return ecode;
-	}
-
-	if (fstat(tdb->file->fd, &st) != 0) {
-		tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
-			   "Failed to fstat file: %s", strerror(errno));
-		tdb_unlock_expand(tdb, F_RDLCK);
-		return TDB_ERR_IO;
-	}
-
-	tdb_unlock_expand(tdb, F_RDLCK);
-
-	if (st.st_size < off + len) {
-		if (probe)
-			return TDB_SUCCESS;
-
-		tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
-			   "tdb_oob len %llu beyond eof at %llu",
-			   (long long)(off + len), (long long)st.st_size);
-		return TDB_ERR_IO;
-	}
-
-	/* Unmap, update size, remap */
-	tdb_munmap(tdb->file);
-
-	tdb->file->map_size = st.st_size;
-	return tdb_mmap(tdb);
-}
-
-/* Endian conversion: we only ever deal with 8 byte quantities */
-void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
-{
-	assert(size % 8 == 0);
-	if (unlikely((tdb->flags & TDB_CONVERT)) && buf) {
-		uint64_t i, *p = (uint64_t *)buf;
-		for (i = 0; i < size / 8; i++)
-			p[i] = bswap_64(p[i]);
-	}
-	return buf;
-}
-
-/* Return first non-zero offset in offset array, or end, or -ve error. */
-/* FIXME: Return the off? */
-uint64_t tdb_find_nonzero_off(struct tdb_context *tdb,
-			      tdb_off_t base, uint64_t start, uint64_t end)
-{
-	uint64_t i;
-	const uint64_t *val;
-
-	/* Zero vs non-zero is the same unconverted: minor optimization. */
-	val = tdb_access_read(tdb, base + start * sizeof(tdb_off_t),
-			      (end - start) * sizeof(tdb_off_t), false);
-	if (TDB_PTR_IS_ERR(val)) {
-		return TDB_ERR_TO_OFF(TDB_PTR_ERR(val));
-	}
-
-	for (i = 0; i < (end - start); i++) {
-		if (val[i])
-			break;
-	}
-	tdb_access_release(tdb, val);
-	return start + i;
-}
-
-/* Return first zero offset in num offset array, or num, or -ve error. */
-uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
-			   uint64_t num)
-{
-	uint64_t i;
-	const uint64_t *val;
-
-	/* Zero vs non-zero is the same unconverted: minor optimization. */
-	val = tdb_access_read(tdb, off, num * sizeof(tdb_off_t), false);
-	if (TDB_PTR_IS_ERR(val)) {
-		return TDB_ERR_TO_OFF(TDB_PTR_ERR(val));
-	}
-
-	for (i = 0; i < num; i++) {
-		if (!val[i])
-			break;
-	}
-	tdb_access_release(tdb, val);
-	return i;
-}
-
-enum TDB_ERROR zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
-{
-	char buf[8192] = { 0 };
-	void *p = tdb->io->direct(tdb, off, len, true);
-	enum TDB_ERROR ecode = TDB_SUCCESS;
-
-	assert(!(tdb->flags & TDB_RDONLY));
-	if (TDB_PTR_IS_ERR(p)) {
-		return TDB_PTR_ERR(p);
-	}
-	if (p) {
-		memset(p, 0, len);
-		return ecode;
-	}
-	while (len) {
-		unsigned todo = len < sizeof(buf) ? len : sizeof(buf);
-		ecode = tdb->io->twrite(tdb, off, buf, todo);
-		if (ecode != TDB_SUCCESS) {
-			break;
-		}
-		len -= todo;
-		off += todo;
-	}
-	return ecode;
-}
-
-tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off)
-{
-	tdb_off_t ret;
-	enum TDB_ERROR ecode;
-
-	if (likely(!(tdb->flags & TDB_CONVERT))) {
-		tdb_off_t *p = tdb->io->direct(tdb, off, sizeof(*p), false);
-		if (TDB_PTR_IS_ERR(p)) {
-			return TDB_ERR_TO_OFF(TDB_PTR_ERR(p));
-		}
-		if (p)
-			return *p;
-	}
-
-	ecode = tdb_read_convert(tdb, off, &ret, sizeof(ret));
-	if (ecode != TDB_SUCCESS) {
-		return TDB_ERR_TO_OFF(ecode);
-	}
-	return ret;
-}
-
-/* write a lump of data at a specified offset */
-static enum TDB_ERROR tdb_write(struct tdb_context *tdb, tdb_off_t off,
-				const void *buf, tdb_len_t len)
-{
-	enum TDB_ERROR ecode;
-
-	if (tdb->flags & TDB_RDONLY) {
-		return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
-				  "Write to read-only database");
-	}
-
-	ecode = tdb->io->oob(tdb, off, len, false);
-	if (ecode != TDB_SUCCESS) {
-		return ecode;
-	}
-
-	if (tdb->file->map_ptr) {
-		memcpy(off + (char *)tdb->file->map_ptr, buf, len);
-	} else {
-#ifdef HAVE_INCOHERENT_MMAP
-		return TDB_ERR_IO;
-#else
-		ssize_t ret;
-		ret = pwrite(tdb->file->fd, buf, len, off);
-		if (ret != len) {
-			/* This shouldn't happen: we avoid sparse files. */
-			if (ret >= 0)
-				errno = ENOSPC;
-
-			return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
-					  "tdb_write: %zi at %zu len=%zu (%s)",
-					  ret, (size_t)off, (size_t)len,
-					  strerror(errno));
-		}
-#endif
-	}
-	return TDB_SUCCESS;
-}
-
-/* read a lump of data at a specified offset */
-static enum TDB_ERROR tdb_read(struct tdb_context *tdb, tdb_off_t off,
-			       void *buf, tdb_len_t len)
-{
-	enum TDB_ERROR ecode;
-
-	ecode = tdb->io->oob(tdb, off, len, false);
-	if (ecode != TDB_SUCCESS) {
-		return ecode;
-	}
-
-	if (tdb->file->map_ptr) {
-		memcpy(buf, off + (char *)tdb->file->map_ptr, len);
-	} else {
-#ifdef HAVE_INCOHERENT_MMAP
-		return TDB_ERR_IO;
-#else
-		ssize_t r = pread(tdb->file->fd, buf, len, off);
-		if (r != len) {
-			return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
-					  "tdb_read failed with %zi at %zu "
-					  "len=%zu (%s) map_size=%zu",
-					  r, (size_t)off, (size_t)len,
-					  strerror(errno),
-					  (size_t)tdb->file->map_size);
-		}
-#endif
-	}
-	return TDB_SUCCESS;
-}
-
-enum TDB_ERROR tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
-				 const void *rec, size_t len)
-{
-	enum TDB_ERROR ecode;
-
-	if (unlikely((tdb->flags & TDB_CONVERT))) {
-		void *conv = malloc(len);
-		if (!conv) {
-			return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
-					  "tdb_write: no memory converting"
-					  " %zu bytes", len);
-		}
-		memcpy(conv, rec, len);
-		ecode = tdb->io->twrite(tdb, off,
-					tdb_convert(tdb, conv, len), len);
-		free(conv);
-	} else {
-		ecode = tdb->io->twrite(tdb, off, rec, len);
-	}
-	return ecode;
-}
-
-enum TDB_ERROR tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
-				void *rec, size_t len)
-{
-	enum TDB_ERROR ecode = tdb->io->tread(tdb, off, rec, len);
-	tdb_convert(tdb, rec, len);
-	return ecode;
-}
-
-enum TDB_ERROR tdb_write_off(struct tdb_context *tdb,
-			     tdb_off_t off, tdb_off_t val)
-{
-	if (tdb->flags & TDB_RDONLY) {
-		return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
-				  "Write to read-only database");
-	}
-
-	if (likely(!(tdb->flags & TDB_CONVERT))) {
-		tdb_off_t *p = tdb->io->direct(tdb, off, sizeof(*p), true);
-		if (TDB_PTR_IS_ERR(p)) {
-			return TDB_PTR_ERR(p);
-		}
-		if (p) {
-			*p = val;
-			return TDB_SUCCESS;
-		}
-	}
-	return tdb_write_convert(tdb, off, &val, sizeof(val));
-}
-
-static void *_tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset,
-			     tdb_len_t len, unsigned int prefix)
-{
-	unsigned char *buf;
-	enum TDB_ERROR ecode;
-
-	/* some systems don't like zero length malloc */
-	buf = malloc(prefix + len ? prefix + len : 1);
-	if (!buf) {
-		tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_USE_ERROR,
-			   "tdb_alloc_read malloc failed len=%zu",
-			   (size_t)(prefix + len));
-		return TDB_ERR_PTR(TDB_ERR_OOM);
-	} else {
-		ecode = tdb->io->tread(tdb, offset, buf+prefix, len);
-		if (unlikely(ecode != TDB_SUCCESS)) {
-			free(buf);
-			return TDB_ERR_PTR(ecode);
-		}
-	}
-	return buf;
-}
-
-/* read a lump of data, allocating the space for it */
-void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
-{
-	return _tdb_alloc_read(tdb, offset, len, 0);
-}
-
-static enum TDB_ERROR fill(struct tdb_context *tdb,
-			   const void *buf, size_t size,
-			   tdb_off_t off, tdb_len_t len)
-{
-	while (len) {
-		size_t n = len > size ? size : len;
-		ssize_t ret = pwrite(tdb->file->fd, buf, n, off);
-		if (ret != n) {
-			if (ret >= 0)
-				errno = ENOSPC;
-
-			return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
-					  "fill failed:"
-					  " %zi at %zu len=%zu (%s)",
-					  ret, (size_t)off, (size_t)len,
-					  strerror(errno));
-		}
-		len -= n;
-		off += n;
-	}
-	return TDB_SUCCESS;
-}
-
-/* expand a file.  we prefer to use ftruncate, as that is what posix
-  says to use for mmap expansion */
-static enum TDB_ERROR tdb_expand_file(struct tdb_context *tdb,
-				      tdb_len_t addition)
-{
-	char buf[8192];
-	enum TDB_ERROR ecode;
-
-	if (tdb->flags & TDB_RDONLY) {
-		return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
-				  "Expand on read-only database");
-	}
-
-	if (tdb->flags & TDB_INTERNAL) {
-		char *new = realloc(tdb->file->map_ptr,
-				    tdb->file->map_size + addition);
-		if (!new) {
-			return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
-					  "No memory to expand database");
-		}
-		tdb->file->map_ptr = new;
-		tdb->file->map_size += addition;
-		return TDB_SUCCESS;
-	} else {
-		/* Unmap before trying to write; old TDB claimed OpenBSD had
-		 * problem with this otherwise. */
-		tdb_munmap(tdb->file);
-
-		/* If this fails, we try to fill anyway. */
-		if (ftruncate(tdb->file->fd, tdb->file->map_size + addition))
-			;
-
-		/* now fill the file with something. This ensures that the
-		   file isn't sparse, which would be very bad if we ran out of
-		   disk. This must be done with write, not via mmap */
-		memset(buf, 0x43, sizeof(buf));
-		ecode = fill(tdb, buf, sizeof(buf), tdb->file->map_size,
-			     addition);
-		if (ecode != TDB_SUCCESS)
-			return ecode;
-		tdb->file->map_size += addition;
-		return tdb_mmap(tdb);
-	}
-}
-
-const void *tdb_access_read(struct tdb_context *tdb,
-			    tdb_off_t off, tdb_len_t len, bool convert)
-{
-	void *ret = NULL;
-
-	if (likely(!(tdb->flags & TDB_CONVERT))) {
-		ret = tdb->io->direct(tdb, off, len, false);
-
-		if (TDB_PTR_IS_ERR(ret)) {
-			return ret;
-		}
-	}
-	if (!ret) {
-		struct tdb_access_hdr *hdr;
-		hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
-		if (TDB_PTR_IS_ERR(hdr)) {
-			return hdr;
-		}
-		hdr->next = tdb->access;
-		tdb->access = hdr;
-		ret = hdr + 1;
-		if (convert) {
-			tdb_convert(tdb, (void *)ret, len);
-		}
-	} else
-		tdb->direct_access++;
-
-	return ret;
-}
-
-void *tdb_access_write(struct tdb_context *tdb,
-		       tdb_off_t off, tdb_len_t len, bool convert)
-{
-	void *ret = NULL;
-
-	if (tdb->flags & TDB_RDONLY) {
-		tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
-			   "Write to read-only database");
-		return TDB_ERR_PTR(TDB_ERR_RDONLY);
-	}
-
-	if (likely(!(tdb->flags & TDB_CONVERT))) {
-		ret = tdb->io->direct(tdb, off, len, true);
-
-		if (TDB_PTR_IS_ERR(ret)) {
-			return ret;
-		}
-	}
-
-	if (!ret) {
-		struct tdb_access_hdr *hdr;
-		hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
-		if (TDB_PTR_IS_ERR(hdr)) {
-			return hdr;
-		}
-		hdr->next = tdb->access;
-		tdb->access = hdr;
-		hdr->off = off;
-		hdr->len = len;
-		hdr->convert = convert;
-		ret = hdr + 1;
-		if (convert)
-			tdb_convert(tdb, (void *)ret, len);
-	} else
-		tdb->direct_access++;
-
-	return ret;
-}
-
-static struct tdb_access_hdr **find_hdr(struct tdb_context *tdb, const void *p)
-{
-	struct tdb_access_hdr **hp;
-
-	for (hp = &tdb->access; *hp; hp = &(*hp)->next) {
-		if (*hp + 1 == p)
-			return hp;
-	}
-	return NULL;
-}
-
-void tdb_access_release(struct tdb_context *tdb, const void *p)
-{
-	struct tdb_access_hdr *hdr, **hp = find_hdr(tdb, p);
-
-	if (hp) {
-		hdr = *hp;
-		*hp = hdr->next;
-		free(hdr);
-	} else
-		tdb->direct_access--;
-}
-
-enum TDB_ERROR tdb_access_commit(struct tdb_context *tdb, void *p)
-{
-	struct tdb_access_hdr *hdr, **hp = find_hdr(tdb, p);
-	enum TDB_ERROR ecode;
-
-	if (hp) {
-		hdr = *hp;
-		if (hdr->convert)
-			ecode = tdb_write_convert(tdb, hdr->off, p, hdr->len);
-		else
-			ecode = tdb_write(tdb, hdr->off, p, hdr->len);
-		*hp = hdr->next;
-		free(hdr);
-	} else {
-		tdb->direct_access--;
-		ecode = TDB_SUCCESS;
-	}
-
-	return ecode;
-}
-
-static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len,
-			bool write_mode)
-{
-	enum TDB_ERROR ecode;
-
-	if (unlikely(!tdb->file->map_ptr))
-		return NULL;
-
-	ecode = tdb_oob(tdb, off, len, false);
-	if (unlikely(ecode != TDB_SUCCESS))
-		return TDB_ERR_PTR(ecode);
-	return (char *)tdb->file->map_ptr + off;
-}
-
-void tdb_inc_seqnum(struct tdb_context *tdb)
-{
-	tdb_off_t seq;
-
-	if (likely(!(tdb->flags & TDB_CONVERT))) {
-		int64_t *direct;
-
-		direct = tdb->io->direct(tdb,
-					 offsetof(struct tdb_header, seqnum),
-					 sizeof(*direct), true);
-		if (likely(direct)) {
-			/* Don't let it go negative, even briefly */
-			if (unlikely((*direct) + 1) < 0)
-				*direct = 0;
-			(*direct)++;
-			return;
-		}
-	}
-
-	seq = tdb_read_off(tdb, offsetof(struct tdb_header, seqnum));
-	if (!TDB_OFF_IS_ERR(seq)) {
-		seq++;
-		if (unlikely((int64_t)seq < 0))
-			seq = 0;
-		tdb_write_off(tdb, offsetof(struct tdb_header, seqnum), seq);
-	}
-}
-
-static const struct tdb_methods io_methods = {
-	tdb_read,
-	tdb_write,
-	tdb_oob,
-	tdb_expand_file,
-	tdb_direct,
-};
-
-/*
-  initialise the default methods table
-*/
-void tdb_io_init(struct tdb_context *tdb)
-{
-	tdb->io = &io_methods;
-}
diff --git a/lib/tdb2/lock.c b/lib/tdb2/lock.c
deleted file mode 100644
index b0583546fb..0000000000
--- a/lib/tdb2/lock.c
+++ /dev/null
@@ -1,883 +0,0 @@
- /*
-   Unix SMB/CIFS implementation.
-
-   trivial database library
-
-   Copyright (C) Andrew Tridgell              1999-2005
-   Copyright (C) Paul `Rusty' Russell		   2000
-   Copyright (C) Jeremy Allison			   2000-2003
-
-     ** NOTE! The following LGPL license applies to the tdb
-     ** library. This does NOT imply that all of Samba is released
-     ** under the LGPL
-
-   This library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 3 of the License, or (at your option) any later version.
-
-   This library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include "private.h"
-#include <assert.h>
-#include <ccan/build_assert/build_assert.h>
-
-/* If we were threaded, we could wait for unlock, but we're not, so fail. */
-enum TDB_ERROR owner_conflict(struct tdb_context *tdb, const char *call)
-{
-	return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
-			  "%s: lock owned by another tdb in this process.",
-			  call);
-}
-
-/* If we fork, we no longer really own locks. */
-bool check_lock_pid(struct tdb_context *tdb, const char *call, bool log)
-{
-	/* No locks?  No problem! */
-	if (tdb->file->allrecord_lock.count == 0
-	    && tdb->file->num_lockrecs == 0) {
-		return true;
-	}
-
-	/* No fork?  No problem! */
-	if (tdb->file->locker == getpid()) {
-		return true;
-	}
-
-	if (log) {
-		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
-			   "%s: fork() detected after lock acquisition!"
-			   " (%u vs %u)", call, tdb->file->locker, getpid());
-	}
-	return false;
-}
-
-int tdb_fcntl_lock(int fd, int rw, off_t off, off_t len, bool waitflag,
-		   void *unused)
-{
-	struct flock fl;
-	int ret;
-
-	do {
-		fl.l_type = rw;
-		fl.l_whence = SEEK_SET;
-		fl.l_start = off;
-		fl.l_len = len;
-
-		if (waitflag)
-			ret = fcntl(fd, F_SETLKW, &fl);
-		else
-			ret = fcntl(fd, F_SETLK, &fl);
-	} while (ret != 0 && errno == EINTR);
-	return ret;
-}
-
-int tdb_fcntl_unlock(int fd, int rw, off_t off, off_t len, void *unused)
-{
-	struct flock fl;
-	int ret;
-
-	do {
-		fl.l_type = F_UNLCK;
-		fl.l_whence = SEEK_SET;
-		fl.l_start = off;
-		fl.l_len = len;
-
-		ret = fcntl(fd, F_SETLKW, &fl);
-	} while (ret != 0 && errno == EINTR);
-	return ret;
-}
-
-static int lock(struct tdb_context *tdb,
-		      int rw, off_t off, off_t len, bool waitflag)
-{
-	int ret;
-	if (tdb->file->allrecord_lock.count == 0
-	    && tdb->file->num_lockrecs == 0) {
-		tdb->file->locker = getpid();
-	}
-
-	tdb->stats.lock_lowlevel++;
-	ret = tdb->lock_fn(tdb->file->fd, rw, off, len, waitflag,
-			   tdb->lock_data);
-	if (!waitflag) {
-		tdb->stats.lock_nonblock++;
-		if (ret != 0)
-			tdb->stats.lock_nonblock_fail++;
-	}
-	return ret;
-}
-
-static int unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
-{
-#if 0 /* Check they matched up locks and unlocks correctly. */
-	char line[80];
-	FILE *locks;
-	bool found = false;
-
-	locks = fopen("/proc/locks", "r");
-
-	while (fgets(line, 80, locks)) {
-		char *p;
-		int type, start, l;
-
-		/* eg. 1: FLOCK  ADVISORY  WRITE 2440 08:01:2180826 0 EOF */
-		p = strchr(line, ':') + 1;
-		if (strncmp(p, " POSIX  ADVISORY  ", strlen(" POSIX  ADVISORY  ")))
-			continue;
-		p += strlen(" FLOCK  ADVISORY  ");
-		if (strncmp(p, "READ  ", strlen("READ  ")) == 0)
-			type = F_RDLCK;
-		else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0)
-			type = F_WRLCK;
-		else
-			abort();
-		p += 6;
-		if (atoi(p) != getpid())
-			continue;
-		p = strchr(strchr(p, ' ') + 1, ' ') + 1;
-		start = atoi(p);
-		p = strchr(p, ' ') + 1;
-		if (strncmp(p, "EOF", 3) == 0)
-			l = 0;
-		else
-			l = atoi(p) - start + 1;
-
-		if (off == start) {
-			if (len != l) {
-				fprintf(stderr, "Len %u should be %u: %s",
-					(int)len, l, line);
-				abort();
-			}
-			if (type != rw) {
-				fprintf(stderr, "Type %s wrong: %s",
-					rw == F_RDLCK ? "READ" : "WRITE", line);
-				abort();
-			}
-			found = true;
-			break;
-		}
-	}
-
-	if (!found) {
-		fprintf(stderr, "Unlock on %u@%u not found!",
-			(int)off, (int)len);
-		abort();
-	}
-
-	fclose(locks);
-#endif
-
-	return tdb->unlock_fn(tdb->file->fd, rw, off, len, tdb->lock_data);
-}
-
-/* a byte range locking function - return 0 on success
-   this functions locks len bytes at the specified offset.
-
-   note that a len of zero means lock to end of file
-*/
-static enum TDB_ERROR tdb_brlock(struct tdb_context *tdb,
-				 int rw_type, tdb_off_t offset, tdb_off_t len,
-				 enum tdb_lock_flags flags)
-{
-	int ret;
-
-	if (tdb->flags & TDB_NOLOCK) {
-		return TDB_SUCCESS;
-	}
-
-	if (rw_type == F_WRLCK && (tdb->flags & TDB_RDONLY)) {
-		return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
-				  "Write lock attempted on read-only database");
-	}
-
-	/* A 32 bit system cannot open a 64-bit file, but it could have
-	 * expanded since then: check here. */
-	if ((size_t)(offset + len) != offset + len) {
-		return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
-				  "tdb_brlock: lock on giant offset %llu",
-				  (long long)(offset + len));
-	}
-
-	ret = lock(tdb, rw_type, offset, len, flags & TDB_LOCK_WAIT);
-	if (ret != 0) {
-		/* Generic lock error. errno set by fcntl.
-		 * EAGAIN is an expected return from non-blocking
-		 * locks. */
-		if (!(flags & TDB_LOCK_PROBE)
-		    && (errno != EAGAIN && errno != EINTR)) {
-			tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
-				   "tdb_brlock failed (fd=%d) at"
-				   " offset %zu rw_type=%d flags=%d len=%zu:"
-				   " %s",
-				   tdb->file->fd, (size_t)offset, rw_type,
-				   flags, (size_t)len, strerror(errno));
-		}
-		return TDB_ERR_LOCK;
-	}
-	return TDB_SUCCESS;
-}
-
-static enum TDB_ERROR tdb_brunlock(struct tdb_context *tdb,
-				   int rw_type, tdb_off_t offset, size_t len)
-{
-	if (tdb->flags & TDB_NOLOCK) {
-		return TDB_SUCCESS;
-	}
-
-	if (!check_lock_pid(tdb, "tdb_brunlock", true))
-		return TDB_ERR_LOCK;
-
-	if (unlock(tdb, rw_type, offset, len) == -1) {
-		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
-				  "tdb_brunlock failed (fd=%d) at offset %zu"
-				  " rw_type=%d len=%zu: %s",
-				  tdb->file->fd, (size_t)offset, rw_type,
-				  (size_t)len, strerror(errno));
-	}
-	return TDB_SUCCESS;
-}
-
-/*
-  upgrade a read lock to a write lock. This needs to be handled in a
-  special way as some OSes (such as solaris) have too conservative
-  deadlock detection and claim a deadlock when progress can be
-  made. For those OSes we may loop for a while.
-*/
-enum TDB_ERROR tdb_allrecord_upgrade(struct tdb_context *tdb, off_t start)
-{
-	int count = 1000;
-
-	if (!check_lock_pid(tdb, "tdb_transaction_prepare_commit", true))
-		return TDB_ERR_LOCK;
-
-	if (tdb->file->allrecord_lock.count != 1) {
-		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
-				  "tdb_allrecord_upgrade failed:"
-				  " count %u too high",
-				  tdb->file->allrecord_lock.count);
-	}
-
-	if (tdb->file->allrecord_lock.off != 1) {
-		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
-				  "tdb_allrecord_upgrade failed:"
-				  " already upgraded?");
-	}
-
-	if (tdb->file->allrecord_lock.owner != tdb) {
-		return owner_conflict(tdb, "tdb_allrecord_upgrade");
-	}
-
-	while (count--) {
-		struct timeval tv;
-		if (tdb_brlock(tdb, F_WRLCK, start, 0,
-			       TDB_LOCK_WAIT|TDB_LOCK_PROBE) == TDB_SUCCESS) {
-			tdb->file->allrecord_lock.ltype = F_WRLCK;
-			tdb->file->allrecord_lock.off = 0;
-			return TDB_SUCCESS;
-		}
-		if (errno != EDEADLK) {
-			break;
-		}
-		/* sleep for as short a time as we can - more portable than usleep() */
-		tv.tv_sec = 0;
-		tv.tv_usec = 1;
-		select(0, NULL, NULL, NULL, &tv);
-	}
-
-	if (errno != EAGAIN && errno != EINTR)
-		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
-			   "tdb_allrecord_upgrade failed");
-	return TDB_ERR_LOCK;
-}
-
-static struct tdb_lock *find_nestlock(struct tdb_context *tdb, tdb_off_t offset,
-				      const struct tdb_context *owner)
-{
-	unsigned int i;
-
-	for (i=0; i<tdb->file->num_lockrecs; i++) {
-		if (tdb->file->lockrecs[i].off == offset) {
-			if (owner && tdb->file->lockrecs[i].owner != owner)
-				return NULL;
-			return &tdb->file->lockrecs[i];
-		}
-	}
-	return NULL;
-}
-
-enum TDB_ERROR tdb_lock_and_recover(struct tdb_context *tdb)
-{
-	enum TDB_ERROR ecode;
-
-	if (!check_lock_pid(tdb, "tdb_transaction_prepare_commit", true))
-		return TDB_ERR_LOCK;
-
-	ecode = tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK,
-				   false);
-	if (ecode != TDB_SUCCESS) {
-		return ecode;
-	}
-
-	ecode = tdb_lock_open(tdb, F_WRLCK, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK);
-	if (ecode != TDB_SUCCESS) {
-		tdb_allrecord_unlock(tdb, F_WRLCK);
-		return ecode;
-	}
-	ecode = tdb_transaction_recover(tdb);
-	tdb_unlock_open(tdb, F_WRLCK);
-	tdb_allrecord_unlock(tdb, F_WRLCK);
-
-	return ecode;
-}
-
-/* lock an offset in the database. */
-static enum TDB_ERROR tdb_nest_lock(struct tdb_context *tdb,
-				    tdb_off_t offset, int ltype,
-				    enum tdb_lock_flags flags)
-{
-	struct tdb_lock *new_lck;
-	enum TDB_ERROR ecode;
-
-	if (offset > (TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE
-		      + tdb->file->map_size / 8)) {
-		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
-				  "tdb_nest_lock: invalid offset %zu ltype=%d",
-				  (size_t)offset, ltype);
-	}
-
-	if (tdb->flags & TDB_NOLOCK)
-		return TDB_SUCCESS;
-
-	if (!check_lock_pid(tdb, "tdb_nest_lock", true)) {
-		return TDB_ERR_LOCK;
-	}
-
-	tdb->stats.locks++;
-
-	new_lck = find_nestlock(tdb, offset, NULL);
-	if (new_lck) {
-		if (new_lck->owner != tdb) {
-			return owner_conflict(tdb, "tdb_nest_lock");
-		}
-
-		if (new_lck->ltype == F_RDLCK && ltype == F_WRLCK) {
-			return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
-					  "tdb_nest_lock:"
-					  " offset %zu has read lock",
-					  (size_t)offset);
-		}
-		/* Just increment the struct, posix locks don't stack. */
-		new_lck->count++;
-		return TDB_SUCCESS;
-	}
-
-#if 0
-	if (tdb->file->num_lockrecs
-	    && offset >= TDB_HASH_LOCK_START
-	    && offset < TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE) {
-		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
-				  "tdb_nest_lock: already have a hash lock?");
-	}
-#endif
-
-	new_lck = (struct tdb_lock *)realloc(
-		tdb->file->lockrecs,
-		sizeof(*tdb->file->lockrecs) * (tdb->file->num_lockrecs+1));
-	if (new_lck == NULL) {
-		return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
-				  "tdb_nest_lock:"
-				  " unable to allocate %zu lock struct",
-				  tdb->file->num_lockrecs + 1);
-	}
-	tdb->file->lockrecs = new_lck;
-
-	/* Since fcntl locks don't nest, we do a lock for the first one,
-	   and simply bump the count for future ones */
-	ecode = tdb_brlock(tdb, ltype, offset, 1, flags);
-	if (ecode != TDB_SUCCESS) {
-		return ecode;
-	}
-
-	/* First time we grab a lock, perhaps someone died in commit? */
-	if (!(flags & TDB_LOCK_NOCHECK)
-	    && tdb->file->num_lockrecs == 0) {
-		tdb_bool_err berr = tdb_needs_recovery(tdb);
-		if (berr != false) {
-			tdb_brunlock(tdb, ltype, offset, 1);
-
-			if (berr < 0)
-				return TDB_OFF_TO_ERR(berr);
-			ecode = tdb_lock_and_recover(tdb);
-			if (ecode == TDB_SUCCESS) {
-				ecode = tdb_brlock(tdb, ltype, offset, 1,
-						   flags);
-			}
-			if (ecode != TDB_SUCCESS) {
-				return ecode;
-			}
-		}
-	}
-
-	tdb->file->lockrecs[tdb->file->num_lockrecs].owner = tdb;
-	tdb->file->lockrecs[tdb->file->num_lockrecs].off = offset;
-	tdb->file->lockrecs[tdb->file->num_lockrecs].count = 1;
-	tdb->file->lockrecs[tdb->file->num_lockrecs].ltype = ltype;
-	tdb->file->num_lockrecs++;
-
-	return TDB_SUCCESS;
-}
-
-static enum TDB_ERROR tdb_nest_unlock(struct tdb_context *tdb,
-				      tdb_off_t off, int ltype)
-{
-	struct tdb_lock *lck;
-	enum TDB_ERROR ecode;
-
-	if (tdb->flags & TDB_NOLOCK)
-		return TDB_SUCCESS;
-
-	lck = find_nestlock(tdb, off, tdb);
-	if ((lck == NULL) || (lck->count == 0)) {
-		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
-				  "tdb_nest_unlock: no lock for %zu",
-				  (size_t)off);
-	}
-
-	if (lck->count > 1) {
-		lck->count--;
-		return TDB_SUCCESS;
-	}
-
-	/*
-	 * This lock has count==1 left, so we need to unlock it in the
-	 * kernel. We don't bother with decrementing the in-memory array
-	 * element, we're about to overwrite it with the last array element
-	 * anyway.
-	 */
-	ecode = tdb_brunlock(tdb, ltype, off, 1);
-
-	/*
-	 * Shrink the array by overwriting the element just unlocked with the
-	 * last array element.
-	 */
-	*lck = tdb->file->lockrecs[--tdb->file->num_lockrecs];
-
-	return ecode;
-}
-
-/*
-  get the transaction lock
- */
-enum TDB_ERROR tdb_transaction_lock(struct tdb_context *tdb, int ltype)
-{
-	return tdb_nest_lock(tdb, TDB_TRANSACTION_LOCK, ltype, TDB_LOCK_WAIT);
-}
-
-/*
-  release the transaction lock
- */
-void tdb_transaction_unlock(struct tdb_context *tdb, int ltype)
-{
-	tdb_nest_unlock(tdb, TDB_TRANSACTION_LOCK, ltype);
-}
-
-/* We only need to lock individual bytes, but Linux merges consecutive locks
- * so we lock in contiguous ranges. */
-static enum TDB_ERROR tdb_lock_gradual(struct tdb_context *tdb,
-				       int ltype, enum tdb_lock_flags flags,
-				       tdb_off_t off, tdb_off_t len)
-{
-	enum TDB_ERROR ecode;
-	enum tdb_lock_flags nb_flags = (flags & ~TDB_LOCK_WAIT);
-
-	if (len <= 1) {
-		/* 0 would mean to end-of-file... */
-		assert(len != 0);
-		/* Single hash.  Just do blocking lock. */
-		return tdb_brlock(tdb, ltype, off, len, flags);
-	}
-
-	/* First we try non-blocking. */
-	ecode = tdb_brlock(tdb, ltype, off, len, nb_flags);
-	if (ecode != TDB_ERR_LOCK) {
-		return ecode;
-	}
-
-	/* Try locking first half, then second. */
-	ecode = tdb_lock_gradual(tdb, ltype, flags, off, len / 2);
-	if (ecode != TDB_SUCCESS)
-		return ecode;
-
-	ecode = tdb_lock_gradual(tdb, ltype, flags,
-				 off + len / 2, len - len / 2);
-	if (ecode != TDB_SUCCESS) {
-		tdb_brunlock(tdb, ltype, off, len / 2);
-	}
-	return ecode;
-}
-
-/* lock/unlock entire database.  It can only be upgradable if you have some
- * other way of guaranteeing exclusivity (ie. transaction write lock). */
-enum TDB_ERROR tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
-				  enum tdb_lock_flags flags, bool upgradable)
-{
-	enum TDB_ERROR ecode;
-	tdb_bool_err berr;
-
-	if (tdb->flags & TDB_NOLOCK)
-		return TDB_SUCCESS;
-
-	if (!check_lock_pid(tdb, "tdb_allrecord_lock", true)) {
-		return TDB_ERR_LOCK;
-	}
-
-	if (tdb->file->allrecord_lock.count) {
-		if (tdb->file->allrecord_lock.owner != tdb) {
-			return owner_conflict(tdb, "tdb_allrecord_lock");
-		}
-
-		if (ltype == F_RDLCK
-		    || tdb->file->allrecord_lock.ltype == F_WRLCK) {
-			tdb->file->allrecord_lock.count++;
-			return TDB_SUCCESS;
-		}
-
-		/* a global lock of a different type exists */
-		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
-				  "tdb_allrecord_lock: already have %s lock",
-				  tdb->file->allrecord_lock.ltype == F_RDLCK
-				  ? "read" : "write");
-	}
-
-	if (tdb_has_hash_locks(tdb)) {
-		/* can't combine global and chain locks */
-		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
-				  "tdb_allrecord_lock:"
-				  " already have chain lock");
-	}
-
-	if (upgradable && ltype != F_RDLCK) {
-		/* tdb error: you can't upgrade a write lock! */
-		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
-				  "tdb_allrecord_lock:"
-				  " can't upgrade a write lock");
-	}
-
-	tdb->stats.locks++;
-again:
-	/* Lock hashes, gradually. */
-	ecode = tdb_lock_gradual(tdb, ltype, flags, TDB_HASH_LOCK_START,
-				 TDB_HASH_LOCK_RANGE);
-	if (ecode != TDB_SUCCESS)
-		return ecode;
-
-	/* Lock free tables: there to end of file. */
-	ecode = tdb_brlock(tdb, ltype,
-			   TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE,
-			   0, flags);
-	if (ecode != TDB_SUCCESS) {
-		tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START,
-			     TDB_HASH_LOCK_RANGE);
-		return ecode;
-	}
-
-	tdb->file->allrecord_lock.owner = tdb;
-	tdb->file->allrecord_lock.count = 1;
-	/* If it's upgradable, it's actually exclusive so we can treat
-	 * it as a write lock. */
-	tdb->file->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype;
-	tdb->file->allrecord_lock.off = upgradable;
-
-	/* Now check for needing recovery. */
-	if (flags & TDB_LOCK_NOCHECK)
-		return TDB_SUCCESS;
-
-	berr = tdb_needs_recovery(tdb);
-	if (likely(berr == false))
-		return TDB_SUCCESS;
-
-	tdb_allrecord_unlock(tdb, ltype);
-	if (berr < 0)
-		return TDB_OFF_TO_ERR(berr);
-	ecode = tdb_lock_and_recover(tdb);
-	if (ecode != TDB_SUCCESS) {
-		return ecode;
-	}
-	goto again;
-}
-
-enum TDB_ERROR tdb_lock_open(struct tdb_context *tdb,
-			     int ltype, enum tdb_lock_flags flags)
-{
-	return tdb_nest_lock(tdb, TDB_OPEN_LOCK, ltype, flags);
-}
-
-void tdb_unlock_open(struct tdb_context *tdb, int ltype)
-{
-	tdb_nest_unlock(tdb, TDB_OPEN_LOCK, ltype);
-}
-
-bool tdb_has_open_lock(struct tdb_context *tdb)
-{
-	return !(tdb->flags & TDB_NOLOCK)
-		&& find_nestlock(tdb, TDB_OPEN_LOCK, tdb) != NULL;
-}
-
-enum TDB_ERROR tdb_lock_expand(struct tdb_context *tdb, int ltype)
-{
-	/* Lock doesn't protect data, so don't check (we recurse if we do!) */
-	return tdb_nest_lock(tdb, TDB_EXPANSION_LOCK, ltype,
-			     TDB_LOCK_WAIT | TDB_LOCK_NOCHECK);
-}
-
-void tdb_unlock_expand(struct tdb_context *tdb, int ltype)
-{
-	tdb_nest_unlock(tdb, TDB_EXPANSION_LOCK, ltype);
-}
-
-/* unlock entire db */
-void tdb_allrecord_unlock(struct tdb_context *tdb, int ltype)
-{
-	if (tdb->flags & TDB_NOLOCK)
-		return;
-
-	if (tdb->file->allrecord_lock.count == 0) {
-		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
-			   "tdb_allrecord_unlock: not locked!");
-		return;
-	}
-
-	if (tdb->file->allrecord_lock.owner != tdb) {
-		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
-			   "tdb_allrecord_unlock: not locked by us!");
-		return;
-	}
-
-	/* Upgradable locks are marked as write locks. */
-	if (tdb->file->allrecord_lock.ltype != ltype
-	    && (!tdb->file->allrecord_lock.off || ltype != F_RDLCK)) {
-		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
-			   "tdb_allrecord_unlock: have %s lock",
-			   tdb->file->allrecord_lock.ltype == F_RDLCK
-			   ? "read" : "write");
-		return;
-	}
-
-	if (tdb->file->allrecord_lock.count > 1) {
-		tdb->file->allrecord_lock.count--;
-		return;
-	}
-
-	tdb->file->allrecord_lock.count = 0;
-	tdb->file->allrecord_lock.ltype = 0;
-
-	tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START, 0);
-}
-
-bool tdb_has_expansion_lock(struct tdb_context *tdb)
-{
-	return find_nestlock(tdb, TDB_EXPANSION_LOCK, tdb) != NULL;
-}
-
-bool tdb_has_hash_locks(struct tdb_context *tdb)
-{
-	unsigned int i;
-
-	for (i=0; i<tdb->file->num_lockrecs; i++) {
-		if (tdb->file->lockrecs[i].off >= TDB_HASH_LOCK_START
-		    && tdb->file->lockrecs[i].off < (TDB_HASH_LOCK_START
-						     + TDB_HASH_LOCK_RANGE))
-			return true;
-	}
-	return false;
-}
-
-static bool tdb_has_free_lock(struct tdb_context *tdb)
-{
-	unsigned int i;
-
-	if (tdb->flags & TDB_NOLOCK)
-		return false;
-
-	for (i=0; i<tdb->file->num_lockrecs; i++) {
-		if (tdb->file->lockrecs[i].off
-		    > TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE)
-			return true;
-	}
-	return false;
-}
-
-enum TDB_ERROR tdb_lock_hashes(struct tdb_context *tdb,
-			       tdb_off_t hash_lock,
-			       tdb_len_t hash_range,
-			       int ltype, enum tdb_lock_flags waitflag)
-{
-	/* FIXME: Do this properly, using hlock_range */
-	unsigned l = TDB_HASH_LOCK_START
-		+ (hash_lock >> (64 - TDB_HASH_LOCK_RANGE_BITS));
-
-	/* a allrecord lock allows us to avoid per chain locks */
-	if (tdb->file->allrecord_lock.count) {
-		if (!check_lock_pid(tdb, "tdb_lock_hashes", true))
-			return TDB_ERR_LOCK;
-
-		if (tdb->file->allrecord_lock.owner != tdb)
-			return owner_conflict(tdb, "tdb_lock_hashes");
-		if (ltype == tdb->file->allrecord_lock.ltype
-		    || ltype == F_RDLCK) {
-			return TDB_SUCCESS;
-		}
-
-		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
-				  "tdb_lock_hashes:"
-				  " already have %s allrecordlock",
-				  tdb->file->allrecord_lock.ltype == F_RDLCK
-				  ? "read" : "write");
-	}
-
-	if (tdb_has_free_lock(tdb)) {
-		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
-				  "tdb_lock_hashes: already have free lock");
-	}
-
-	if (tdb_has_expansion_lock(tdb)) {
-		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
-				  "tdb_lock_hashes:"
-				  " already have expansion lock");
-	}
-
-	return tdb_nest_lock(tdb, l, ltype, waitflag);
-}
-
-enum TDB_ERROR tdb_unlock_hashes(struct tdb_context *tdb,
-				 tdb_off_t hash_lock,
-				 tdb_len_t hash_range, int ltype)
-{
-	unsigned l = TDB_HASH_LOCK_START
-		+ (hash_lock >> (64 - TDB_HASH_LOCK_RANGE_BITS));
-
-	if (tdb->flags & TDB_NOLOCK)
-		return 0;
-
-	/* a allrecord lock allows us to avoid per chain locks */
-	if (tdb->file->allrecord_lock.count) {
-		if (tdb->file->allrecord_lock.ltype == F_RDLCK
-		    && ltype == F_WRLCK) {
-			return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
-					  "tdb_unlock_hashes RO allrecord!");
-		}
-		if (tdb->file->allrecord_lock.owner != tdb) {
-			return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
-					  "tdb_unlock_hashes:"
-					  " not locked by us!");
-		}
-		return TDB_SUCCESS;
-	}
-
-	return tdb_nest_unlock(tdb, l, ltype);
-}
-
-/* Hash locks use TDB_HASH_LOCK_START + the next 30 bits.
- * Then we begin; bucket offsets are sizeof(tdb_len_t) apart, so we divide.
- * The result is that on 32 bit systems we don't use lock values > 2^31 on
- * files that are less than 4GB.
- */
-static tdb_off_t free_lock_off(tdb_off_t b_off)
-{
-	return TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE
-		+ b_off / sizeof(tdb_off_t);
-}
-
-enum TDB_ERROR tdb_lock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off,
-				    enum tdb_lock_flags waitflag)
-{
-	assert(b_off >= sizeof(struct tdb_header));
-
-	if (tdb->flags & TDB_NOLOCK)
-		return 0;
-
-	/* a allrecord lock allows us to avoid per chain locks */
-	if (tdb->file->allrecord_lock.count) {
-		if (!check_lock_pid(tdb, "tdb_lock_free_bucket", true))
-			return TDB_ERR_LOCK;
-
-		if (tdb->file->allrecord_lock.owner != tdb) {
-			return owner_conflict(tdb, "tdb_lock_free_bucket");
-		}
-
-		if (tdb->file->allrecord_lock.ltype == F_WRLCK)
-			return 0;
-		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
-				  "tdb_lock_free_bucket with"
-				  " read-only allrecordlock!");
-	}
-
-#if 0 /* FIXME */
-	if (tdb_has_expansion_lock(tdb)) {
-		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
-				  "tdb_lock_free_bucket:"
-				  " already have expansion lock");
-	}
-#endif
-
-	return tdb_nest_lock(tdb, free_lock_off(b_off), F_WRLCK, waitflag);
-}
-
-void tdb_unlock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off)
-{
-	if (tdb->file->allrecord_lock.count)
-		return;
-
-	tdb_nest_unlock(tdb, free_lock_off(b_off), F_WRLCK);
-}
-
-_PUBLIC_ enum TDB_ERROR tdb_lockall(struct tdb_context *tdb)
-{
-	return tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false);
-}
-
-_PUBLIC_ void tdb_unlockall(struct tdb_context *tdb)
-{
-	tdb_allrecord_unlock(tdb, F_WRLCK);
-}
-
-_PUBLIC_ enum TDB_ERROR tdb_lockall_read(struct tdb_context *tdb)
-{
-	return tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
-}
-
-_PUBLIC_ void tdb_unlockall_read(struct tdb_context *tdb)
-{
-	tdb_allrecord_unlock(tdb, F_RDLCK);
-}
-
-void tdb_lock_cleanup(struct tdb_context *tdb)
-{
-	unsigned int i;
-
-	/* We don't want to warn: they're allowed to close tdb after fork. */
-	if (!check_lock_pid(tdb, "tdb_close", false))
-		return;
-
-	while (tdb->file->allrecord_lock.count
-	       && tdb->file->allrecord_lock.owner == tdb) {
-		tdb_allrecord_unlock(tdb, tdb->file->allrecord_lock.ltype);
-	}
-
-	for (i=0; i<tdb->file->num_lockrecs; i++) {
-		if (tdb->file->lockrecs[i].owner == tdb) {
-			tdb_nest_unlock(tdb,
-					tdb->file->lockrecs[i].off,
-					tdb->file->lockrecs[i].ltype);
-			i--;
-		}
-	}
-}
diff --git a/lib/tdb2/open.c b/lib/tdb2/open.c
deleted file mode 100644
index fab855b6b8..0000000000
--- a/lib/tdb2/open.c
+++ /dev/null
@@ -1,768 +0,0 @@
- /*
-   Trivial Database 2: opening and closing TDBs
-   Copyright (C) Rusty Russell 2010
-
-   This library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 3 of the License, or (at your option) any later version.
-
-   This library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
-#include "private.h"
-#include <ccan/build_assert/build_assert.h>
-#include <assert.h>
-
-/* all tdbs, to detect double-opens (fcntl file don't nest!) */
-static struct tdb_context *tdbs = NULL;
-
-static struct tdb_file *find_file(dev_t device, ino_t ino)
-{
-	struct tdb_context *i;
-
-	for (i = tdbs; i; i = i->next) {
-		if (i->file->device == device && i->file->inode == ino) {
-			i->file->refcnt++;
-			return i->file;
-		}
-	}
-	return NULL;
-}
-
-static bool read_all(int fd, void *buf, size_t len)
-{
-	while (len) {
-		ssize_t ret;
-		ret = read(fd, buf, len);
-		if (ret < 0)
-			return false;
-		if (ret == 0) {
-			/* ETOOSHORT? */
-			errno = EWOULDBLOCK;
-			return false;
-		}
-		buf = (char *)buf + ret;
-		len -= ret;
-	}
-	return true;
-}
-
-static uint64_t random_number(struct tdb_context *tdb)
-{
-	int fd;
-	uint64_t ret = 0;
-	struct timeval now;
-
-	fd = open("/dev/urandom", O_RDONLY);
-	if (fd >= 0) {
-		if (read_all(fd, &ret, sizeof(ret))) {
-			close(fd);
-			return ret;
-		}
-		close(fd);
-	}
-	/* FIXME: Untested!  Based on Wikipedia protocol description! */
-	fd = open("/dev/egd-pool", O_RDWR);
-	if (fd >= 0) {
-		/* Command is 1, next byte is size we want to read. */
-		char cmd[2] = { 1, sizeof(uint64_t) };
-		if (write(fd, cmd, sizeof(cmd)) == sizeof(cmd)) {
-			char reply[1 + sizeof(uint64_t)];
-			int r = read(fd, reply, sizeof(reply));
-			if (r > 1) {
-				/* Copy at least some bytes. */
-				memcpy(&ret, reply+1, r - 1);
-				if (reply[0] == sizeof(uint64_t)
-				    && r == sizeof(reply)) {
-					close(fd);
-					return ret;
-				}
-			}
-		}
-		close(fd);
-	}
-
-	/* Fallback: pid and time. */
-	gettimeofday(&now, NULL);
-	ret = getpid() * 100132289ULL + now.tv_sec * 1000000ULL + now.tv_usec;
-	tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
-		   "tdb_open: random from getpid and time");
-	return ret;
-}
-
-static void tdb2_context_init(struct tdb_context *tdb)
-{
-	/* Initialize the TDB2 fields here */
-	tdb_io_init(tdb);
-	tdb->direct_access = 0;
-	tdb->transaction = NULL;
-	tdb->access = NULL;
-}
-
-struct new_database {
-	struct tdb_header hdr;
-	struct tdb_freetable ftable;
-};
-
-/* initialise a new database */
-static enum TDB_ERROR tdb_new_database(struct tdb_context *tdb,
-				       struct tdb_attribute_seed *seed,
-				       struct tdb_header *hdr)
-{
-	/* We make it up in memory, then write it out if not internal */
-	struct new_database newdb;
-	unsigned int magic_len;
-	ssize_t rlen;
-	enum TDB_ERROR ecode;
-
-	/* Fill in the header */
-	newdb.hdr.version = TDB_VERSION;
-	if (seed)
-		newdb.hdr.hash_seed = seed->seed;
-	else
-		newdb.hdr.hash_seed = random_number(tdb);
-	newdb.hdr.hash_test = TDB_HASH_MAGIC;
-	newdb.hdr.hash_test = tdb->hash_fn(&newdb.hdr.hash_test,
-					   sizeof(newdb.hdr.hash_test),
-					   newdb.hdr.hash_seed,
-					   tdb->hash_data);
-	newdb.hdr.recovery = 0;
-	newdb.hdr.features_used = newdb.hdr.features_offered = TDB_FEATURE_MASK;
-	newdb.hdr.seqnum = 0;
-	newdb.hdr.capabilities = 0;
-	memset(newdb.hdr.reserved, 0, sizeof(newdb.hdr.reserved));
-	/* Initial hashes are empty. */
-	memset(newdb.hdr.hashtable, 0, sizeof(newdb.hdr.hashtable));
-
-	/* Free is empty. */
-	newdb.hdr.free_table = offsetof(struct new_database, ftable);
-	memset(&newdb.ftable, 0, sizeof(newdb.ftable));
-	ecode = set_header(NULL, &newdb.ftable.hdr, TDB_FTABLE_MAGIC, 0,
-			   sizeof(newdb.ftable) - sizeof(newdb.ftable.hdr),
-			   sizeof(newdb.ftable) - sizeof(newdb.ftable.hdr),
-			   0);
-	if (ecode != TDB_SUCCESS) {
-		return ecode;
-	}
-
-	/* Magic food */
-	memset(newdb.hdr.magic_food, 0, sizeof(newdb.hdr.magic_food));
-	strcpy(newdb.hdr.magic_food, TDB_MAGIC_FOOD);
-
-	/* This creates an endian-converted database, as if read from disk */
-	magic_len = sizeof(newdb.hdr.magic_food);
-	tdb_convert(tdb,
-		    (char *)&newdb.hdr + magic_len, sizeof(newdb) - magic_len);
-
-	*hdr = newdb.hdr;
-
-	if (tdb->flags & TDB_INTERNAL) {
-		tdb->file->map_size = sizeof(newdb);
-		tdb->file->map_ptr = malloc(tdb->file->map_size);
-		if (!tdb->file->map_ptr) {
-			return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
-					  "tdb_new_database:"
-					  " failed to allocate");
-		}
-		memcpy(tdb->file->map_ptr, &newdb, tdb->file->map_size);
-		return TDB_SUCCESS;
-	}
-	if (lseek(tdb->file->fd, 0, SEEK_SET) == -1) {
-		return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
-				  "tdb_new_database:"
-				  " failed to seek: %s", strerror(errno));
-	}
-
-	if (ftruncate(tdb->file->fd, 0) == -1) {
-		return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
-				  "tdb_new_database:"
-				  " failed to truncate: %s", strerror(errno));
-	}
-
-	rlen = write(tdb->file->fd, &newdb, sizeof(newdb));
-	if (rlen != sizeof(newdb)) {
-		if (rlen >= 0)
-			errno = ENOSPC;
-		return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
-				  "tdb_new_database: %zi writing header: %s",
-				  rlen, strerror(errno));
-	}
-	return TDB_SUCCESS;
-}
-
-static enum TDB_ERROR tdb_new_file(struct tdb_context *tdb)
-{
-	tdb->file = malloc(sizeof(*tdb->file));
-	if (!tdb->file)
-		return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
-				  "tdb_open: cannot alloc tdb_file structure");
-	tdb->file->num_lockrecs = 0;
-	tdb->file->lockrecs = NULL;
-	tdb->file->allrecord_lock.count = 0;
-	tdb->file->refcnt = 1;
-	tdb->file->map_ptr = NULL;
-	return TDB_SUCCESS;
-}
-
-_PUBLIC_ enum TDB_ERROR tdb_set_attribute(struct tdb_context *tdb,
-				 const union tdb_attribute *attr)
-{
-	switch (attr->base.attr) {
-	case TDB_ATTRIBUTE_LOG:
-		tdb->log_fn = attr->log.fn;
-		tdb->log_data = attr->log.data;
-		break;
-	case TDB_ATTRIBUTE_HASH:
-	case TDB_ATTRIBUTE_SEED:
-	case TDB_ATTRIBUTE_OPENHOOK:
-		return tdb->last_error
-			= tdb_logerr(tdb, TDB_ERR_EINVAL,
-				     TDB_LOG_USE_ERROR,
-				     "tdb_set_attribute:"
-				     " cannot set %s after opening",
-				     attr->base.attr == TDB_ATTRIBUTE_HASH
-				     ? "TDB_ATTRIBUTE_HASH"
-				     : attr->base.attr == TDB_ATTRIBUTE_SEED
-				     ? "TDB_ATTRIBUTE_SEED"
-				     : "TDB_ATTRIBUTE_OPENHOOK");
-	case TDB_ATTRIBUTE_STATS:
-		return tdb->last_error
-			= tdb_logerr(tdb, TDB_ERR_EINVAL,
-				     TDB_LOG_USE_ERROR,
-				     "tdb_set_attribute:"
-				     " cannot set TDB_ATTRIBUTE_STATS");
-	case TDB_ATTRIBUTE_FLOCK:
-		tdb->lock_fn = attr->flock.lock;
-		tdb->unlock_fn = attr->flock.unlock;
-		tdb->lock_data = attr->flock.data;
-		break;
-	default:
-		return tdb->last_error
-			= tdb_logerr(tdb, TDB_ERR_EINVAL,
-				     TDB_LOG_USE_ERROR,
-				     "tdb_set_attribute:"
-				     " unknown attribute type %u",
-				     attr->base.attr);
-	}
-	return TDB_SUCCESS;
-}
-
-_PUBLIC_ enum TDB_ERROR tdb_get_attribute(struct tdb_context *tdb,
-				 union tdb_attribute *attr)
-{
-	switch (attr->base.attr) {
-	case TDB_ATTRIBUTE_LOG:
-		if (!tdb->log_fn)
-			return tdb->last_error = TDB_ERR_NOEXIST;
-		attr->log.fn = tdb->log_fn;
-		attr->log.data = tdb->log_data;
-		break;
-	case TDB_ATTRIBUTE_HASH:
-		attr->hash.fn = tdb->hash_fn;
-		attr->hash.data = tdb->hash_data;
-		break;
-	case TDB_ATTRIBUTE_SEED:
-		attr->seed.seed = tdb->hash_seed;
-		break;
-	case TDB_ATTRIBUTE_OPENHOOK:
-		if (!tdb->openhook)
-			return tdb->last_error = TDB_ERR_NOEXIST;
-		attr->openhook.fn = tdb->openhook;
-		attr->openhook.data = tdb->openhook_data;
-		break;
-	case TDB_ATTRIBUTE_STATS: {
-		size_t size = attr->stats.size;
-		if (size > tdb->stats.size)
-			size = tdb->stats.size;
-		memcpy(&attr->stats, &tdb->stats, size);
-		break;
-	}
-	case TDB_ATTRIBUTE_FLOCK:
-		attr->flock.lock = tdb->lock_fn;
-		attr->flock.unlock = tdb->unlock_fn;
-		attr->flock.data = tdb->lock_data;
-		break;
-	default:
-		return tdb->last_error
-			= tdb_logerr(tdb, TDB_ERR_EINVAL,
-				     TDB_LOG_USE_ERROR,
-				     "tdb_get_attribute:"
-				     " unknown attribute type %u",
-				     attr->base.attr);
-	}
-	attr->base.next = NULL;
-	return TDB_SUCCESS;
-}
-
-_PUBLIC_ void tdb_unset_attribute(struct tdb_context *tdb,
-			 enum tdb_attribute_type type)
-{
-	switch (type) {
-	case TDB_ATTRIBUTE_LOG:
-		tdb->log_fn = NULL;
-		break;
-	case TDB_ATTRIBUTE_OPENHOOK:
-		tdb->openhook = NULL;
-		break;
-	case TDB_ATTRIBUTE_HASH:
-	case TDB_ATTRIBUTE_SEED:
-		tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
-			   "tdb_unset_attribute: cannot unset %s after opening",
-			   type == TDB_ATTRIBUTE_HASH
-			   ? "TDB_ATTRIBUTE_HASH"
-			   : "TDB_ATTRIBUTE_SEED");
-		break;
-	case TDB_ATTRIBUTE_STATS:
-		tdb_logerr(tdb, TDB_ERR_EINVAL,
-			   TDB_LOG_USE_ERROR,
-			   "tdb_unset_attribute:"
-			   "cannot unset TDB_ATTRIBUTE_STATS");
-		break;
-	case TDB_ATTRIBUTE_FLOCK:
-		tdb->lock_fn = tdb_fcntl_lock;
-		tdb->unlock_fn = tdb_fcntl_unlock;
-		break;
-	default:
-		tdb_logerr(tdb, TDB_ERR_EINVAL,
-			   TDB_LOG_USE_ERROR,
-			   "tdb_unset_attribute: unknown attribute type %u",
-			   type);
-	}
-}
-
-/* The top three bits of the capability tell us whether it matters. */
-enum TDB_ERROR unknown_capability(struct tdb_context *tdb, const char *caller,
-				  tdb_off_t type)
-{
-	if (type & TDB_CAP_NOOPEN) {
-		return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
-				  "%s: file has unknown capability %llu",
-				  caller, type & TDB_CAP_NOOPEN);
-	}
-
-	if ((type & TDB_CAP_NOWRITE) && !(tdb->flags & TDB_RDONLY)) {
-		return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_ERROR,
-				  "%s: file has unknown capability %llu"
-				  " (cannot write to it)",
-				  caller, type & TDB_CAP_NOOPEN);
-	}
-
-	if (type & TDB_CAP_NOCHECK) {
-		tdb->flags |= TDB_CANT_CHECK;
-	}
-	return TDB_SUCCESS;
-}
-
-static enum TDB_ERROR capabilities_ok(struct tdb_context *tdb,
-				      tdb_off_t capabilities)
-{
-	tdb_off_t off, next;
-	enum TDB_ERROR ecode = TDB_SUCCESS;
-	const struct tdb_capability *cap;
-
-	/* Check capability list. */
-	for (off = capabilities; off && ecode == TDB_SUCCESS; off = next) {
-		cap = tdb_access_read(tdb, off, sizeof(*cap), true);
-		if (TDB_PTR_IS_ERR(cap)) {
-			return TDB_PTR_ERR(cap);
-		}
-
-		switch (cap->type & TDB_CAP_TYPE_MASK) {
-		/* We don't understand any capabilities (yet). */
-		default:
-			ecode = unknown_capability(tdb, "tdb_open", cap->type);
-		}
-		next = cap->next;
-		tdb_access_release(tdb, cap);
-	}
-	return ecode;
-}
-
-_PUBLIC_ struct tdb_context *tdb_open(const char *name, int tdb_flags,
-			     int open_flags, mode_t mode,
-			     union tdb_attribute *attr)
-{
-	struct tdb_context *tdb;
-	struct stat st;
-	int saved_errno = 0;
-	uint64_t hash_test;
-	unsigned v;
-	ssize_t rlen;
-	struct tdb_header hdr;
-	struct tdb_attribute_seed *seed = NULL;
-	tdb_bool_err berr;
-	enum TDB_ERROR ecode;
-	int openlock;
-
-	tdb = malloc(sizeof(*tdb) + (name ? strlen(name) + 1 : 0));
-	if (!tdb) {
-		/* Can't log this */
-		errno = ENOMEM;
-		return NULL;
-	}
-	/* Set name immediately for logging functions. */
-	if (name) {
-		tdb->name = strcpy((char *)(tdb + 1), name);
-	} else {
-		tdb->name = NULL;
-	}
-	tdb->flags = tdb_flags;
-	tdb->log_fn = NULL;
-	tdb->open_flags = open_flags;
-	tdb->last_error = TDB_SUCCESS;
-	tdb->file = NULL;
-	tdb->openhook = NULL;
-	tdb->lock_fn = tdb_fcntl_lock;
-	tdb->unlock_fn = tdb_fcntl_unlock;
-	tdb->hash_fn = tdb_jenkins_hash;
-	memset(&tdb->stats, 0, sizeof(tdb->stats));
-	tdb->stats.base.attr = TDB_ATTRIBUTE_STATS;
-	tdb->stats.size = sizeof(tdb->stats);
-
-	while (attr) {
-		switch (attr->base.attr) {
-		case TDB_ATTRIBUTE_HASH:
-			tdb->hash_fn = attr->hash.fn;
-			tdb->hash_data = attr->hash.data;
-			break;
-		case TDB_ATTRIBUTE_SEED:
-			seed = &attr->seed;
-			break;
-		case TDB_ATTRIBUTE_OPENHOOK:
-			tdb->openhook = attr->openhook.fn;
-			tdb->openhook_data = attr->openhook.data;
-			break;
-		default:
-			/* These are set as normal. */
-			ecode = tdb_set_attribute(tdb, attr);
-			if (ecode != TDB_SUCCESS)
-				goto fail;
-		}
-		attr = attr->base.next;
-	}
-
-	if (tdb_flags & ~(TDB_INTERNAL | TDB_NOLOCK | TDB_NOMMAP | TDB_CONVERT
-			  | TDB_NOSYNC | TDB_SEQNUM | TDB_ALLOW_NESTING
-			  | TDB_RDONLY)) {
-		ecode = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
-				   "tdb_open: unknown flags %u", tdb_flags);
-		goto fail;
-	}
-
-	if (seed) {
-		if (!(tdb_flags & TDB_INTERNAL) && !(open_flags & O_CREAT)) {
-			ecode = tdb_logerr(tdb, TDB_ERR_EINVAL,
-					   TDB_LOG_USE_ERROR,
-					   "tdb_open:"
-					   " cannot set TDB_ATTRIBUTE_SEED"
-					   " without O_CREAT.");
-			goto fail;
-		}
-	}
-
-	if ((open_flags & O_ACCMODE) == O_WRONLY) {
-		ecode = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
-				   "tdb_open: can't open tdb %s write-only",
-				   name);
-		goto fail;
-	}
-
-	if ((open_flags & O_ACCMODE) == O_RDONLY) {
-		openlock = F_RDLCK;
-		tdb->flags |= TDB_RDONLY;
-	} else {
-		if (tdb_flags & TDB_RDONLY) {
-			ecode = tdb_logerr(tdb, TDB_ERR_EINVAL,
-					   TDB_LOG_USE_ERROR,
-					   "tdb_open: can't use TDB_RDONLY"
-					   " without O_RDONLY");
-			goto fail;
-		}
-		openlock = F_WRLCK;
-	}
-
-	/* internal databases don't need any of the rest. */
-	if (tdb->flags & TDB_INTERNAL) {
-		tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
-		ecode = tdb_new_file(tdb);
-		if (ecode != TDB_SUCCESS) {
-			goto fail;
-		}
-		tdb->file->fd = -1;
-		ecode = tdb_new_database(tdb, seed, &hdr);
-		if (ecode == TDB_SUCCESS) {
-			tdb_convert(tdb, &hdr.hash_seed,
-				    sizeof(hdr.hash_seed));
-			tdb->hash_seed = hdr.hash_seed;
-			tdb2_context_init(tdb);
-			tdb_ftable_init(tdb);
-		}
-		if (ecode != TDB_SUCCESS) {
-			goto fail;
-		}
-		return tdb;
-	}
-
-	if (stat(name, &st) != -1)
-		tdb->file = find_file(st.st_dev, st.st_ino);
-
-	if (!tdb->file) {
-		int fd;
-
-		if ((fd = open(name, open_flags, mode)) == -1) {
-			/* errno set by open(2) */
-			saved_errno = errno;
-			tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
-				   "tdb_open: could not open file %s: %s",
-				   name, strerror(errno));
-			goto fail_errno;
-		}
-
-		/* on exec, don't inherit the fd */
-		v = fcntl(fd, F_GETFD, 0);
-		fcntl(fd, F_SETFD, v | FD_CLOEXEC);
-
-		if (fstat(fd, &st) == -1) {
-			saved_errno = errno;
-			tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
-				   "tdb_open: could not stat open %s: %s",
-				   name, strerror(errno));
-			close(fd);
-			goto fail_errno;
-		}
-
-		ecode = tdb_new_file(tdb);
-		if (ecode != TDB_SUCCESS) {
-			close(fd);
-			goto fail;
-		}
-
-		tdb->file->fd = fd;
-		tdb->file->device = st.st_dev;
-		tdb->file->inode = st.st_ino;
-		tdb->file->map_ptr = NULL;
-		tdb->file->map_size = 0;
-	}
-
-	/* ensure there is only one process initialising at once */
-	ecode = tdb_lock_open(tdb, openlock, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK);
-	if (ecode != TDB_SUCCESS) {
-		saved_errno = errno;
-		goto fail_errno;
-	}
-
-	/* call their open hook if they gave us one. */
-	if (tdb->openhook) {
-		ecode = tdb->openhook(tdb->file->fd, tdb->openhook_data);
-		if (ecode != TDB_SUCCESS) {
-			tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
-				   "tdb_open: open hook failed");
-			goto fail;
-		}
-		open_flags |= O_CREAT;
-	}
-
-	/* If they used O_TRUNC, read will return 0. */
-	rlen = pread(tdb->file->fd, &hdr, sizeof(hdr), 0);
-	if (rlen == 0 && (open_flags & O_CREAT)) {
-		ecode = tdb_new_database(tdb, seed, &hdr);
-		if (ecode != TDB_SUCCESS) {
-			goto fail;
-		}
-	} else if (rlen < 0) {
-		ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
-				   "tdb_open: error %s reading %s",
-				   strerror(errno), name);
-		goto fail;
-	} else if (rlen < sizeof(hdr)
-		   || strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
-		ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
-				   "tdb_open: %s is not a tdb2 file", name);
-		goto fail;
-	}
-
-	if (hdr.version != TDB_VERSION) {
-		if (hdr.version == bswap_64(TDB_VERSION))
-			tdb->flags |= TDB_CONVERT;
-		else {
-			/* wrong version */
-			ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
-					   "tdb_open:"
-					   " %s is unknown version 0x%llx",
-					   name, (long long)hdr.version);
-			goto fail;
-		}
-	} else if (tdb->flags & TDB_CONVERT) {
-		ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
-				   "tdb_open:"
-				   " %s does not need TDB_CONVERT",
-				   name);
-		goto fail;
-	}
-
-	tdb2_context_init(tdb);
-
-	tdb_convert(tdb, &hdr, sizeof(hdr));
-	tdb->hash_seed = hdr.hash_seed;
-	hash_test = TDB_HASH_MAGIC;
-	hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
-	if (hdr.hash_test != hash_test) {
-		/* wrong hash variant */
-		ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
-				   "tdb_open:"
-				   " %s uses a different hash function",
-				   name);
-		goto fail;
-	}
-
-	ecode = capabilities_ok(tdb, hdr.capabilities);
-	if (ecode != TDB_SUCCESS) {
-		goto fail;
-	}
-
-	/* Clear any features we don't understand. */
-	if ((open_flags & O_ACCMODE) != O_RDONLY) {
-		hdr.features_used &= TDB_FEATURE_MASK;
-		ecode = tdb_write_convert(tdb, offsetof(struct tdb_header,
-							features_used),
-					  &hdr.features_used,
-					  sizeof(hdr.features_used));
-		if (ecode != TDB_SUCCESS)
-			goto fail;
-	}
-
-	tdb_unlock_open(tdb, openlock);
-
-	/* This makes sure we have current map_size and mmap. */
-	ecode = tdb->io->oob(tdb, tdb->file->map_size, 1, true);
-	if (unlikely(ecode != TDB_SUCCESS))
-		goto fail;
-
-	/* Now it's fully formed, recover if necessary. */
-	berr = tdb_needs_recovery(tdb);
-	if (unlikely(berr != false)) {
-		if (berr < 0) {
-			ecode = TDB_OFF_TO_ERR(berr);
-			goto fail;
-		}
-		ecode = tdb_lock_and_recover(tdb);
-		if (ecode != TDB_SUCCESS) {
-			goto fail;
-		}
-	}
-
-	ecode = tdb_ftable_init(tdb);
-	if (ecode != TDB_SUCCESS) {
-		goto fail;
-	}
-
-	tdb->next = tdbs;
-	tdbs = tdb;
-	return tdb;
-
- fail:
-	/* Map ecode to some logical errno. */
-	switch (TDB_ERR_TO_OFF(ecode)) {
-	case TDB_ERR_TO_OFF(TDB_ERR_CORRUPT):
-	case TDB_ERR_TO_OFF(TDB_ERR_IO):
-		saved_errno = EIO;
-		break;
-	case TDB_ERR_TO_OFF(TDB_ERR_LOCK):
-		saved_errno = EWOULDBLOCK;
-		break;
-	case TDB_ERR_TO_OFF(TDB_ERR_OOM):
-		saved_errno = ENOMEM;
-		break;
-	case TDB_ERR_TO_OFF(TDB_ERR_EINVAL):
-		saved_errno = EINVAL;
-		break;
-	default:
-		saved_errno = EINVAL;
-		break;
-	}
-
-fail_errno:
-#ifdef TDB_TRACE
-	close(tdb->tracefd);
-#endif
-	if (tdb->file) {
-		tdb_lock_cleanup(tdb);
-		if (--tdb->file->refcnt == 0) {
-			assert(tdb->file->num_lockrecs == 0);
-			if (tdb->file->map_ptr) {
-				if (tdb->flags & TDB_INTERNAL) {
-					free(tdb->file->map_ptr);
-				} else
-					tdb_munmap(tdb->file);
-			}
-			if (close(tdb->file->fd) != 0)
-				tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
-					   "tdb_open: failed to close tdb fd"
-					   " on error: %s", strerror(errno));
-			free(tdb->file->lockrecs);
-			free(tdb->file);
-		}
-	}
-
-	free(tdb);
-	errno = saved_errno;
-	return NULL;
-}
-
-_PUBLIC_ int tdb_close(struct tdb_context *tdb)
-{
-	int ret = 0;
-	struct tdb_context **i;
-
-	tdb_trace(tdb, "tdb_close");
-
-	if (tdb->transaction) {
-		tdb_transaction_cancel(tdb);
-	}
-
-	if (tdb->file->map_ptr) {
-		if (tdb->flags & TDB_INTERNAL)
-			free(tdb->file->map_ptr);
-		else
-			tdb_munmap(tdb->file);
-	}
-	if (tdb->file) {
-		tdb_lock_cleanup(tdb);
-		if (--tdb->file->refcnt == 0) {
-			ret = close(tdb->file->fd);
-			free(tdb->file->lockrecs);
-			free(tdb->file);
-		}
-	}
-
-	/* Remove from tdbs list */
-	for (i = &tdbs; *i; i = &(*i)->next) {
-		if (*i == tdb) {
-			*i = tdb->next;
-			break;
-		}
-	}
-
-#ifdef TDB_TRACE
-	close(tdb->tracefd);
-#endif
-	free(tdb);
-
-	return ret;
-}
-
-_PUBLIC_ void tdb_foreach_(int (*fn)(struct tdb_context *, void *), void *p)
-{
-	struct tdb_context *i;
-
-	for (i = tdbs; i; i = i->next) {
-		if (fn(i, p) != 0)
-			break;
-	}
-}
diff --git a/lib/tdb2/private.h b/lib/tdb2/private.h
deleted file mode 100644
index 8c917a70b2..0000000000
--- a/lib/tdb2/private.h
+++ /dev/null
@@ -1,657 +0,0 @@
-#ifndef TDB_PRIVATE_H
-#define TDB_PRIVATE_H
- /*
-   Trivial Database 2: private types and prototypes
-   Copyright (C) Rusty Russell 2010
-
-   This library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 3 of the License, or (at your option) any later version.
-
-   This library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include "config.h"
-#ifndef HAVE_CCAN
-#error You need ccan to build tdb2!
-#endif
-#include "tdb2.h"
-#include <ccan/compiler/compiler.h>
-#include <ccan/likely/likely.h>
-#include <ccan/endian/endian.h>
-
-#ifdef HAVE_LIBREPLACE
-#include "replace.h"
-#include "system/filesys.h"
-#include "system/time.h"
-#include "system/shmem.h"
-#include "system/select.h"
-#include "system/wait.h"
-#else
-#include <stdint.h>
-#include <stdbool.h>
-#include <stdlib.h>
-#include <stddef.h>
-#include <sys/time.h>
-#include <sys/mman.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <stdio.h>
-#include <utime.h>
-#include <unistd.h>
-#endif
-
-#ifndef TEST_IT
-#define TEST_IT(cond)
-#endif
-
-/* #define TDB_TRACE 1 */
-
-#ifndef __STRING
-#define __STRING(x)    #x
-#endif
-
-#ifndef __STRINGSTRING
-#define __STRINGSTRING(x) __STRING(x)
-#endif
-
-#ifndef __location__
-#define __location__ __FILE__ ":" __STRINGSTRING(__LINE__)
-#endif
-
-typedef uint64_t tdb_len_t;
-typedef uint64_t tdb_off_t;
-
-#define TDB_MAGIC_FOOD "TDB file\n"
-#define TDB_VERSION ((uint64_t)(0x26011967 + 7))
-#define TDB_USED_MAGIC ((uint64_t)0x1999)
-#define TDB_HTABLE_MAGIC ((uint64_t)0x1888)
-#define TDB_CHAIN_MAGIC ((uint64_t)0x1777)
-#define TDB_FTABLE_MAGIC ((uint64_t)0x1666)
-#define TDB_CAP_MAGIC ((uint64_t)0x1555)
-#define TDB_FREE_MAGIC ((uint64_t)0xFE)
-#define TDB_HASH_MAGIC (0xA1ABE11A01092008ULL)
-#define TDB_RECOVERY_MAGIC (0xf53bc0e7ad124589ULL)
-#define TDB_RECOVERY_INVALID_MAGIC (0x0ULL)
-
-/* Capability bits. */
-#define TDB_CAP_TYPE_MASK	0x1FFFFFFFFFFFFFFFULL
-#define TDB_CAP_NOCHECK		0x8000000000000000ULL
-#define TDB_CAP_NOWRITE		0x4000000000000000ULL
-#define TDB_CAP_NOOPEN		0x2000000000000000ULL
-
-#define TDB_OFF_IS_ERR(off) unlikely(off >= (tdb_off_t)(long)TDB_ERR_LAST)
-#define TDB_OFF_TO_ERR(off) ((enum TDB_ERROR)(long)(off))
-#define TDB_ERR_TO_OFF(ecode) ((tdb_off_t)(long)(ecode))
-
-/* Packing errors into pointers and v.v. */
-#define TDB_PTR_IS_ERR(ptr) \
-	unlikely((unsigned long)(ptr) >= (unsigned long)TDB_ERR_LAST)
-#define TDB_PTR_ERR(p) ((enum TDB_ERROR)(long)(p))
-#define TDB_ERR_PTR(err) ((void *)(long)(err))
-
-/* Common case of returning true, false or -ve error. */
-typedef int tdb_bool_err;
-
-/* Prevent others from opening the file. */
-#define TDB_OPEN_LOCK 0
-/* Expanding file. */
-#define TDB_EXPANSION_LOCK 2
-/* Doing a transaction. */
-#define TDB_TRANSACTION_LOCK 8
-/* Hash chain locks. */
-#define TDB_HASH_LOCK_START 64
-
-/* Range for hash locks. */
-#define TDB_HASH_LOCK_RANGE_BITS 30
-#define TDB_HASH_LOCK_RANGE (1 << TDB_HASH_LOCK_RANGE_BITS)
-
-/* We have 1024 entries in the top level. */
-#define TDB_TOPLEVEL_HASH_BITS 10
-/* And 64 entries in each sub-level: thus 64 bits exactly after 9 levels. */
-#define TDB_SUBLEVEL_HASH_BITS 6
-/* And 8 entries in each group, ie 8 groups per sublevel. */
-#define TDB_HASH_GROUP_BITS 3
-/* This is currently 10: beyond this we chain. */
-#define TDB_MAX_LEVELS (1+(64-TDB_TOPLEVEL_HASH_BITS) / TDB_SUBLEVEL_HASH_BITS)
-
-/* Extend file by least 100 times larger than needed. */
-#define TDB_EXTENSION_FACTOR 100
-
-/* We steal bits from the offsets to store hash info. */
-#define TDB_OFF_HASH_GROUP_MASK ((1ULL << TDB_HASH_GROUP_BITS) - 1)
-/* We steal this many upper bits, giving a maximum offset of 64 exabytes. */
-#define TDB_OFF_UPPER_STEAL 8
-#define   TDB_OFF_UPPER_STEAL_EXTRA 7
-/* The bit number where we store extra hash bits. */
-#define TDB_OFF_HASH_EXTRA_BIT 57
-#define TDB_OFF_UPPER_STEAL_SUBHASH_BIT 56
-
-/* Additional features we understand.  Currently: none. */
-#define TDB_FEATURE_MASK ((uint64_t)0)
-
-/* The bit number where we store the extra hash bits. */
-/* Convenience mask to get actual offset. */
-#define TDB_OFF_MASK \
-	(((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1) - TDB_OFF_HASH_GROUP_MASK)
-
-/* How many buckets in a free list: see size_to_bucket(). */
-#define TDB_FREE_BUCKETS (64 - TDB_OFF_UPPER_STEAL)
-
-/* We have to be able to fit a free record here. */
-#define TDB_MIN_DATA_LEN	\
-	(sizeof(struct tdb_free_record) - sizeof(struct tdb_used_record))
-
-/* Indicates this entry is not on an flist (can happen during coalescing) */
-#define TDB_FTABLE_NONE ((1ULL << TDB_OFF_UPPER_STEAL) - 1)
-
-struct tdb_used_record {
-	/* For on-disk compatibility, we avoid bitfields:
-	   magic: 16,        (highest)
-	   key_len_bits: 5,
-	   extra_padding: 32
-	   hash_bits: 11
-	*/
-        uint64_t magic_and_meta;
-	/* The bottom key_len_bits*2 are key length, rest is data length. */
-        uint64_t key_and_data_len;
-};
-
-static inline unsigned rec_key_bits(const struct tdb_used_record *r)
-{
-	return ((r->magic_and_meta >> 43) & ((1 << 5)-1)) * 2;
-}
-
-static inline uint64_t rec_key_length(const struct tdb_used_record *r)
-{
-	return r->key_and_data_len & ((1ULL << rec_key_bits(r)) - 1);
-}
-
-static inline uint64_t rec_data_length(const struct tdb_used_record *r)
-{
-	return r->key_and_data_len >> rec_key_bits(r);
-}
-
-static inline uint64_t rec_extra_padding(const struct tdb_used_record *r)
-{
-	return (r->magic_and_meta >> 11) & 0xFFFFFFFF;
-}
-
-static inline uint32_t rec_hash(const struct tdb_used_record *r)
-{
-	return r->magic_and_meta & ((1 << 11) - 1);
-}
-
-static inline uint16_t rec_magic(const struct tdb_used_record *r)
-{
-	return (r->magic_and_meta >> 48);
-}
-
-struct tdb_free_record {
-        uint64_t magic_and_prev; /* TDB_OFF_UPPER_STEAL bits magic, then prev */
-        uint64_t ftable_and_len; /* Len not counting these two fields. */
-	/* This is why the minimum record size is 8 bytes.  */
-	uint64_t next;
-};
-
-static inline uint64_t frec_prev(const struct tdb_free_record *f)
-{
-	return f->magic_and_prev & ((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1);
-}
-
-static inline uint64_t frec_magic(const struct tdb_free_record *f)
-{
-	return f->magic_and_prev >> (64 - TDB_OFF_UPPER_STEAL);
-}
-
-static inline uint64_t frec_len(const struct tdb_free_record *f)
-{
-	return f->ftable_and_len & ((1ULL << (64 - TDB_OFF_UPPER_STEAL))-1);
-}
-
-static inline unsigned frec_ftable(const struct tdb_free_record *f)
-{
-	return f->ftable_and_len >> (64 - TDB_OFF_UPPER_STEAL);
-}
-
-struct tdb_recovery_record {
-	uint64_t magic;
-	/* Length of record (add this header to get total length). */
-	uint64_t max_len;
-	/* Length used. */
-	uint64_t len;
-	/* Old length of file before transaction. */
-	uint64_t eof;
-};
-
-/* If we bottom out of the subhashes, we chain. */
-struct tdb_chain {
-	tdb_off_t rec[1 << TDB_HASH_GROUP_BITS];
-	tdb_off_t next;
-};
-
-/* this is stored at the front of every database */
-struct tdb_header {
-	char magic_food[64]; /* for /etc/magic */
-	/* FIXME: Make me 32 bit? */
-	uint64_t version; /* version of the code */
-	uint64_t hash_test; /* result of hashing HASH_MAGIC. */
-	uint64_t hash_seed; /* "random" seed written at creation time. */
-	tdb_off_t free_table; /* (First) free table. */
-	tdb_off_t recovery; /* Transaction recovery area. */
-
-	uint64_t features_used; /* Features all writers understand */
-	uint64_t features_offered; /* Features offered */
-
-	uint64_t seqnum; /* Sequence number for TDB_SEQNUM */
-
-	tdb_off_t capabilities; /* Optional linked list of capabilities. */
-	tdb_off_t reserved[22];
-
-	/* Top level hash table. */
-	tdb_off_t hashtable[1ULL << TDB_TOPLEVEL_HASH_BITS];
-};
-
-struct tdb_freetable {
-	struct tdb_used_record hdr;
-	tdb_off_t next;
-	tdb_off_t buckets[TDB_FREE_BUCKETS];
-};
-
-struct tdb_capability {
-	struct tdb_used_record hdr;
-	tdb_off_t type;
-	tdb_off_t next;
-	/* ... */
-};
-
-/* Information about a particular (locked) hash entry. */
-struct hash_info {
-	/* Full hash value of entry. */
-	uint64_t h;
-	/* Start and length of lock acquired. */
-	tdb_off_t hlock_start;
-	tdb_len_t hlock_range;
-	/* Start of hash group. */
-	tdb_off_t group_start;
-	/* Bucket we belong in. */
-	unsigned int home_bucket;
-	/* Bucket we (or an empty space) were found in. */
-	unsigned int found_bucket;
-	/* How many bits of the hash are already used. */
-	unsigned int hash_used;
-	/* Current working group. */
-	tdb_off_t group[1 << TDB_HASH_GROUP_BITS];
-};
-
-struct traverse_info {
-	struct traverse_level {
-		tdb_off_t hashtable;
-		/* We ignore groups here, and treat it as a big array. */
-		unsigned entry;
-		unsigned int total_buckets;
-	} levels[TDB_MAX_LEVELS + 1];
-	unsigned int num_levels;
-	unsigned int toplevel_group;
-	/* This makes delete-everything-inside-traverse work as expected. */
-	tdb_off_t prev;
-};
-
-enum tdb_lock_flags {
-	/* WAIT == F_SETLKW, NOWAIT == F_SETLK */
-	TDB_LOCK_NOWAIT = 0,
-	TDB_LOCK_WAIT = 1,
-	/* If set, don't log an error on failure. */
-	TDB_LOCK_PROBE = 2,
-	/* If set, don't check for recovery (used by recovery code). */
-	TDB_LOCK_NOCHECK = 4,
-};
-
-struct tdb_lock {
-	struct tdb_context *owner;
-	off_t off;
-	uint32_t count;
-	uint32_t ltype;
-};
-
-/* This is only needed for tdb_access_commit, but used everywhere to
- * simplify. */
-struct tdb_access_hdr {
-	struct tdb_access_hdr *next;
-	tdb_off_t off;
-	tdb_len_t len;
-	bool convert;
-};
-
-struct tdb_file {
-	/* How many are sharing us? */
-	unsigned int refcnt;
-
-	/* Mmap (if any), or malloc (for TDB_INTERNAL). */
-	void *map_ptr;
-
-	/* How much space has been mapped (<= current file size) */
-	tdb_len_t map_size;
-
-	/* The file descriptor (-1 for TDB_INTERNAL). */
-	int fd;
-
-	/* Lock information */
-	pid_t locker;
-	struct tdb_lock allrecord_lock;
-	size_t num_lockrecs;
-	struct tdb_lock *lockrecs;
-
-	/* Identity of this file. */
-	dev_t device;
-	ino_t inode;
-};
-
-struct tdb_methods {
-	enum TDB_ERROR (*tread)(struct tdb_context *, tdb_off_t, void *,
-				tdb_len_t);
-	enum TDB_ERROR (*twrite)(struct tdb_context *, tdb_off_t, const void *,
-				 tdb_len_t);
-	enum TDB_ERROR (*oob)(struct tdb_context *, tdb_off_t, tdb_len_t, bool);
-	enum TDB_ERROR (*expand_file)(struct tdb_context *, tdb_len_t);
-	void *(*direct)(struct tdb_context *, tdb_off_t, size_t, bool);
-};
-
-/*
-  internal prototypes
-*/
-/* hash.c: */
-uint64_t tdb_jenkins_hash(const void *key, size_t length, uint64_t seed,
-			  void *unused);
-
-enum TDB_ERROR first_in_hash(struct tdb_context *tdb,
-			     struct traverse_info *tinfo,
-			     TDB_DATA *kbuf, size_t *dlen);
-
-enum TDB_ERROR next_in_hash(struct tdb_context *tdb,
-			    struct traverse_info *tinfo,
-			    TDB_DATA *kbuf, size_t *dlen);
-
-/* Hash random memory. */
-uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len);
-
-/* Hash on disk. */
-uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off);
-
-/* Find and lock a hash entry (or where it would be). */
-tdb_off_t find_and_lock(struct tdb_context *tdb,
-			struct tdb_data key,
-			int ltype,
-			struct hash_info *h,
-			struct tdb_used_record *rec,
-			struct traverse_info *tinfo);
-
-enum TDB_ERROR replace_in_hash(struct tdb_context *tdb,
-			       struct hash_info *h,
-			       tdb_off_t new_off);
-
-enum TDB_ERROR add_to_hash(struct tdb_context *tdb, struct hash_info *h,
-			   tdb_off_t new_off);
-
-enum TDB_ERROR delete_from_hash(struct tdb_context *tdb, struct hash_info *h);
-
-/* For tdb_check */
-bool is_subhash(tdb_off_t val);
-enum TDB_ERROR unknown_capability(struct tdb_context *tdb, const char *caller,
-				  tdb_off_t type);
-
-/* free.c: */
-enum TDB_ERROR tdb_ftable_init(struct tdb_context *tdb);
-
-/* check.c needs these to iterate through free lists. */
-tdb_off_t first_ftable(struct tdb_context *tdb);
-tdb_off_t next_ftable(struct tdb_context *tdb, tdb_off_t ftable);
-
-/* This returns space or -ve error number. */
-tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
-		uint64_t hash, unsigned magic, bool growing);
-
-/* Put this record in a free list. */
-enum TDB_ERROR add_free_record(struct tdb_context *tdb,
-			       tdb_off_t off, tdb_len_t len_with_header,
-			       enum tdb_lock_flags waitflag,
-			       bool coalesce_ok);
-
-/* Set up header for a used/ftable/htable/chain/capability record. */
-enum TDB_ERROR set_header(struct tdb_context *tdb,
-			  struct tdb_used_record *rec,
-			  unsigned magic, uint64_t keylen, uint64_t datalen,
-			  uint64_t actuallen, unsigned hashlow);
-
-/* Used by tdb_check to verify. */
-unsigned int size_to_bucket(tdb_len_t data_len);
-tdb_off_t bucket_off(tdb_off_t ftable_off, unsigned bucket);
-
-/* Used by tdb_summary */
-tdb_off_t dead_space(struct tdb_context *tdb, tdb_off_t off);
-
-/* Adjust expansion, used by create_recovery_area */
-tdb_off_t tdb_expand_adjust(tdb_off_t map_size, tdb_off_t size);
-
-/* io.c: */
-/* Initialize tdb->methods. */
-void tdb_io_init(struct tdb_context *tdb);
-
-/* Convert endian of the buffer if required. */
-void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size);
-
-/* Unmap and try to map the tdb. */
-void tdb_munmap(struct tdb_file *file);
-enum TDB_ERROR tdb_mmap(struct tdb_context *tdb);
-
-/* Either alloc a copy, or give direct access.  Release frees or noop. */
-const void *tdb_access_read(struct tdb_context *tdb,
-			    tdb_off_t off, tdb_len_t len, bool convert);
-void *tdb_access_write(struct tdb_context *tdb,
-		       tdb_off_t off, tdb_len_t len, bool convert);
-
-/* Release result of tdb_access_read/write. */
-void tdb_access_release(struct tdb_context *tdb, const void *p);
-/* Commit result of tdb_acces_write. */
-enum TDB_ERROR tdb_access_commit(struct tdb_context *tdb, void *p);
-
-/* Convenience routine to get an offset. */
-tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off);
-
-/* Write an offset at an offset. */
-enum TDB_ERROR tdb_write_off(struct tdb_context *tdb, tdb_off_t off,
-			     tdb_off_t val);
-
-/* Clear an ondisk area. */
-enum TDB_ERROR zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len);
-
-/* Return a non-zero offset between >= start < end in this array (or end). */
-tdb_off_t tdb_find_nonzero_off(struct tdb_context *tdb,
-			       tdb_off_t base,
-			       uint64_t start,
-			       uint64_t end);
-
-/* Return a zero offset in this array, or num. */
-tdb_off_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
-			    uint64_t num);
-
-/* Allocate and make a copy of some offset. */
-void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len);
-
-/* Writes a converted copy of a record. */
-enum TDB_ERROR tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
-				 const void *rec, size_t len);
-
-/* Reads record and converts it */
-enum TDB_ERROR tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
-				void *rec, size_t len);
-
-/* Bump the seqnum (caller checks for tdb->flags & TDB_SEQNUM) */
-void tdb_inc_seqnum(struct tdb_context *tdb);
-
-/* lock.c: */
-/* Print message because another tdb owns a lock we want. */
-enum TDB_ERROR owner_conflict(struct tdb_context *tdb, const char *call);
-
-/* If we fork, we no longer really own locks. */
-bool check_lock_pid(struct tdb_context *tdb, const char *call, bool log);
-
-/* Lock/unlock a range of hashes. */
-enum TDB_ERROR tdb_lock_hashes(struct tdb_context *tdb,
-			       tdb_off_t hash_lock, tdb_len_t hash_range,
-			       int ltype, enum tdb_lock_flags waitflag);
-enum TDB_ERROR tdb_unlock_hashes(struct tdb_context *tdb,
-				 tdb_off_t hash_lock,
-				 tdb_len_t hash_range, int ltype);
-
-/* For closing the file. */
-void tdb_lock_cleanup(struct tdb_context *tdb);
-
-/* Lock/unlock a particular free bucket. */
-enum TDB_ERROR tdb_lock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off,
-				    enum tdb_lock_flags waitflag);
-void tdb_unlock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off);
-
-/* Serialize transaction start. */
-enum TDB_ERROR tdb_transaction_lock(struct tdb_context *tdb, int ltype);
-void tdb_transaction_unlock(struct tdb_context *tdb, int ltype);
-
-/* Do we have any hash locks (ie. via tdb_chainlock) ? */
-bool tdb_has_hash_locks(struct tdb_context *tdb);
-
-/* Lock entire database. */
-enum TDB_ERROR tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
-				  enum tdb_lock_flags flags, bool upgradable);
-void tdb_allrecord_unlock(struct tdb_context *tdb, int ltype);
-enum TDB_ERROR tdb_allrecord_upgrade(struct tdb_context *tdb, off_t start);
-
-/* Serialize db open. */
-enum TDB_ERROR tdb_lock_open(struct tdb_context *tdb,
-			     int ltype, enum tdb_lock_flags flags);
-void tdb_unlock_open(struct tdb_context *tdb, int ltype);
-bool tdb_has_open_lock(struct tdb_context *tdb);
-
-/* Serialize db expand. */
-enum TDB_ERROR tdb_lock_expand(struct tdb_context *tdb, int ltype);
-void tdb_unlock_expand(struct tdb_context *tdb, int ltype);
-bool tdb_has_expansion_lock(struct tdb_context *tdb);
-
-/* If it needs recovery, grab all the locks and do it. */
-enum TDB_ERROR tdb_lock_and_recover(struct tdb_context *tdb);
-
-/* Default lock and unlock functions. */
-int tdb_fcntl_lock(int fd, int rw, off_t off, off_t len, bool waitflag, void *);
-int tdb_fcntl_unlock(int fd, int rw, off_t off, off_t len, void *);
-
-/* transaction.c: */
-enum TDB_ERROR tdb_transaction_recover(struct tdb_context *tdb);
-tdb_bool_err tdb_needs_recovery(struct tdb_context *tdb);
-
-struct tdb_context {
-	/* Single list of all TDBs, to detect multiple opens. */
-	struct tdb_context *next;
-
-	/* Filename of the database. */
-	const char *name;
-
-	/* Logging function */
-	void (*log_fn)(struct tdb_context *tdb,
-		       enum tdb_log_level level,
-		       enum TDB_ERROR ecode,
-		       const char *message,
-		       void *data);
-	void *log_data;
-
-	/* Open flags passed to tdb_open. */
-	int open_flags;
-
-	/* low level (fnctl) lock functions. */
-	int (*lock_fn)(int fd, int rw, off_t off, off_t len, bool w, void *);
-	int (*unlock_fn)(int fd, int rw, off_t off, off_t len, void *);
-	void *lock_data;
-
-	/* the tdb flags passed to tdb_open. */
-	uint32_t flags;
-
-	/* Our statistics. */
-	struct tdb_attribute_stats stats;
-
-	/* The actual file information */
-	struct tdb_file *file;
-
-	/* Hash function. */
-	uint64_t (*hash_fn)(const void *key, size_t len, uint64_t seed, void *);
-	void *hash_data;
-	uint64_t hash_seed;
-
-	/* Our open hook, if any. */
-	enum TDB_ERROR (*openhook)(int fd, void *data);
-	void *openhook_data;
-
-	/* Last error we returned. */
-	enum TDB_ERROR last_error;
-
-	/* Are we accessing directly? (debugging check). */
-	int direct_access;
-
-	/* Set if we are in a transaction. */
-	struct tdb_transaction *transaction;
-
-	/* What free table are we using? */
-	tdb_off_t ftable_off;
-	unsigned int ftable;
-
-	/* IO methods: changes for transactions. */
-	const struct tdb_methods *io;
-
-	/* Direct access information */
-	struct tdb_access_hdr *access;
-};
-
-/* tdb.c: */
-enum TDB_ERROR COLD PRINTF_FMT(4, 5)
-	tdb_logerr(struct tdb_context *tdb,
-		   enum TDB_ERROR ecode,
-		   enum tdb_log_level level,
-		   const char *fmt, ...);
-
-#ifdef TDB_TRACE
-void tdb_trace(struct tdb_context *tdb, const char *op);
-void tdb_trace_seqnum(struct tdb_context *tdb, uint32_t seqnum, const char *op);
-void tdb_trace_open(struct tdb_context *tdb, const char *op,
-		    unsigned hash_size, unsigned tdb_flags, unsigned open_flags);
-void tdb_trace_ret(struct tdb_context *tdb, const char *op, int ret);
-void tdb_trace_retrec(struct tdb_context *tdb, const char *op, TDB_DATA ret);
-void tdb_trace_1rec(struct tdb_context *tdb, const char *op,
-		    TDB_DATA rec);
-void tdb_trace_1rec_ret(struct tdb_context *tdb, const char *op,
-			TDB_DATA rec, int ret);
-void tdb_trace_1rec_retrec(struct tdb_context *tdb, const char *op,
-			   TDB_DATA rec, TDB_DATA ret);
-void tdb_trace_2rec_flag_ret(struct tdb_context *tdb, const char *op,
-			     TDB_DATA rec1, TDB_DATA rec2, unsigned flag,
-			     int ret);
-void tdb_trace_2rec_retrec(struct tdb_context *tdb, const char *op,
-			   TDB_DATA rec1, TDB_DATA rec2, TDB_DATA ret);
-#else
-#define tdb_trace(tdb, op)
-#define tdb_trace_seqnum(tdb, seqnum, op)
-#define tdb_trace_open(tdb, op, hash_size, tdb_flags, open_flags)
-#define tdb_trace_ret(tdb, op, ret)
-#define tdb_trace_retrec(tdb, op, ret)
-#define tdb_trace_1rec(tdb, op, rec)
-#define tdb_trace_1rec_ret(tdb, op, rec, ret)
-#define tdb_trace_1rec_retrec(tdb, op, rec, ret)
-#define tdb_trace_2rec_flag_ret(tdb, op, rec1, rec2, flag, ret)
-#define tdb_trace_2rec_retrec(tdb, op, rec1, rec2, ret)
-#endif /* !TDB_TRACE */
-
-#endif
diff --git a/lib/tdb2/pytdb.c b/lib/tdb2/pytdb.c
deleted file mode 100644
index 1fa4e5828b..0000000000
--- a/lib/tdb2/pytdb.c
+++ /dev/null
@@ -1,591 +0,0 @@
-/*
-   Unix SMB/CIFS implementation.
-
-   Python interface to tdb2.  Simply modified from tdb1 version.
-
-   Copyright (C) 2004-2006 Tim Potter <tpot@samba.org>
-   Copyright (C) 2007-2008 Jelmer Vernooij <jelmer@samba.org>
-   Copyright (C) 2011 Rusty Russell <rusty@rustcorp.com.au>
-
-     ** NOTE! The following LGPL license applies to the tdb
-     ** library. This does NOT imply that all of Samba is released
-     ** under the LGPL
-
-   This library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 3 of the License, or (at your option) any later version.
-
-   This library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include <Python.h>
-#include "replace.h"
-#include "system/filesys.h"
-
-#ifndef Py_RETURN_NONE
-#define Py_RETURN_NONE return Py_INCREF(Py_None), Py_None
-#endif
-
-/* Include tdb headers */
-#include <tdb2.h>
-
-typedef struct {
-	PyObject_HEAD
-	struct tdb_context *ctx;
-	bool closed;
-} PyTdbObject;
-
-staticforward PyTypeObject PyTdb;
-
-static void PyErr_SetTDBError(enum TDB_ERROR e)
-{
-	PyErr_SetObject(PyExc_RuntimeError,
-		Py_BuildValue("(i,s)", e, tdb_errorstr(e)));
-}
-
-static TDB_DATA PyString_AsTDB_DATA(PyObject *data)
-{
-	TDB_DATA ret;
-	ret.dptr = (unsigned char *)PyString_AsString(data);
-	ret.dsize = PyString_Size(data);
-	return ret;
-}
-
-static PyObject *PyString_FromTDB_DATA(TDB_DATA data)
-{
-	PyObject *ret = PyString_FromStringAndSize((const char *)data.dptr,
-						   data.dsize);
-	free(data.dptr);
-	return ret;
-}
-
-#define PyErr_TDB_ERROR_IS_ERR_RAISE(ret) \
-	if (ret != TDB_SUCCESS) { \
-		PyErr_SetTDBError(ret); \
-		return NULL; \
-	}
-
-static void stderr_log(struct tdb_context *tdb,
-		       enum tdb_log_level level,
-		       enum TDB_ERROR ecode,
-		       const char *message,
-		       void *data)
-{
-	fprintf(stderr, "%s:%s:%s\n",
-		tdb_name(tdb), tdb_errorstr(ecode), message);
-}
-
-static PyObject *py_tdb_open(PyTypeObject *type, PyObject *args, PyObject *kwargs)
-{
-	char *name = NULL;
-	int tdb_flags = TDB_DEFAULT, flags = O_RDWR, mode = 0600;
-	struct tdb_context *ctx;
-	PyTdbObject *ret;
-	union tdb_attribute logattr;
-	const char *kwnames[] = { "name", "tdb_flags", "flags", "mode", NULL };
-
-	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|siii", cast_const2(char **, kwnames), &name, &tdb_flags, &flags, &mode))
-		return NULL;
-
-	if (name == NULL) {
-		tdb_flags |= TDB_INTERNAL;
-	}
-
-	logattr.log.base.attr = TDB_ATTRIBUTE_LOG;
-	logattr.log.base.next = NULL;
-	logattr.log.fn = stderr_log;
-	ctx = tdb_open(name, tdb_flags, flags, mode, &logattr);
-	if (ctx == NULL) {
-		PyErr_SetFromErrno(PyExc_IOError);
-		return NULL;
-	}
-
-	ret = PyObject_New(PyTdbObject, &PyTdb);
-	if (!ret) {
-		tdb_close(ctx);
-		return NULL;
-	}
-
-	ret->ctx = ctx;
-	ret->closed = false;
-	return (PyObject *)ret;
-}
-
-static PyObject *obj_transaction_cancel(PyTdbObject *self)
-{
-	tdb_transaction_cancel(self->ctx);
-	Py_RETURN_NONE;
-}
-
-static PyObject *obj_transaction_commit(PyTdbObject *self)
-{
-	enum TDB_ERROR ret = tdb_transaction_commit(self->ctx);
-	PyErr_TDB_ERROR_IS_ERR_RAISE(ret);
-	Py_RETURN_NONE;
-}
-
-static PyObject *obj_transaction_prepare_commit(PyTdbObject *self)
-{
-	enum TDB_ERROR ret = tdb_transaction_prepare_commit(self->ctx);
-	PyErr_TDB_ERROR_IS_ERR_RAISE(ret);
-	Py_RETURN_NONE;
-}
-
-static PyObject *obj_transaction_start(PyTdbObject *self)
-{
-	enum TDB_ERROR ret = tdb_transaction_start(self->ctx);
-	PyErr_TDB_ERROR_IS_ERR_RAISE(ret);
-	Py_RETURN_NONE;
-}
-
-static PyObject *obj_lockall(PyTdbObject *self)
-{
-	enum TDB_ERROR ret = tdb_lockall(self->ctx);
-	PyErr_TDB_ERROR_IS_ERR_RAISE(ret);
-	Py_RETURN_NONE;
-}
-
-static PyObject *obj_unlockall(PyTdbObject *self)
-{
-	tdb_unlockall(self->ctx);
-	Py_RETURN_NONE;
-}
-
-static PyObject *obj_lockall_read(PyTdbObject *self)
-{
-	enum TDB_ERROR ret = tdb_lockall_read(self->ctx);
-	PyErr_TDB_ERROR_IS_ERR_RAISE(ret);
-	Py_RETURN_NONE;
-}
-
-static PyObject *obj_unlockall_read(PyTdbObject *self)
-{
-	tdb_unlockall_read(self->ctx);
-	Py_RETURN_NONE;
-}
-
-static PyObject *obj_close(PyTdbObject *self)
-{
-	int ret;
-	if (self->closed)
-		Py_RETURN_NONE;
-	ret = tdb_close(self->ctx);
-	self->closed = true;
-	if (ret != 0) {
-		PyErr_SetTDBError(TDB_ERR_IO);
-		return NULL;
-	}
-	Py_RETURN_NONE;
-}
-
-static PyObject *obj_get(PyTdbObject *self, PyObject *args)
-{
-	TDB_DATA key, data;
-	PyObject *py_key;
-	enum TDB_ERROR ret;
-	if (!PyArg_ParseTuple(args, "O", &py_key))
-		return NULL;
-
-	key = PyString_AsTDB_DATA(py_key);
-	ret = tdb_fetch(self->ctx, key, &data);
-	if (ret == TDB_ERR_NOEXIST)
-		Py_RETURN_NONE;
-	PyErr_TDB_ERROR_IS_ERR_RAISE(ret);
-	return PyString_FromTDB_DATA(data);
-}
-
-static PyObject *obj_append(PyTdbObject *self, PyObject *args)
-{
-	TDB_DATA key, data;
-	PyObject *py_key, *py_data;
-	enum TDB_ERROR ret;
-	if (!PyArg_ParseTuple(args, "OO", &py_key, &py_data))
-		return NULL;
-
-	key = PyString_AsTDB_DATA(py_key);
-	data = PyString_AsTDB_DATA(py_data);
-
-	ret = tdb_append(self->ctx, key, data);
-	PyErr_TDB_ERROR_IS_ERR_RAISE(ret);
-	Py_RETURN_NONE;
-}
-
-static PyObject *obj_firstkey(PyTdbObject *self)
-{
-	enum TDB_ERROR ret;
-	TDB_DATA key;
-
-	ret = tdb_firstkey(self->ctx, &key);
-	if (ret == TDB_ERR_NOEXIST)
-		Py_RETURN_NONE;
-	PyErr_TDB_ERROR_IS_ERR_RAISE(ret);
-
-	return PyString_FromTDB_DATA(key);
-}
-
-static PyObject *obj_nextkey(PyTdbObject *self, PyObject *args)
-{
-	TDB_DATA key;
-	PyObject *py_key;
-	enum TDB_ERROR ret;
-	if (!PyArg_ParseTuple(args, "O", &py_key))
-		return NULL;
-
-	/* Malloc here, since tdb_nextkey frees. */
-	key.dsize = PyString_Size(py_key);
-	key.dptr = malloc(key.dsize);
-	memcpy(key.dptr, PyString_AsString(py_key), key.dsize);
-
-	ret = tdb_nextkey(self->ctx, &key);
-	if (ret == TDB_ERR_NOEXIST)
-		Py_RETURN_NONE;
-	PyErr_TDB_ERROR_IS_ERR_RAISE(ret);
-
-	return PyString_FromTDB_DATA(key);
-}
-
-static PyObject *obj_delete(PyTdbObject *self, PyObject *args)
-{
-	TDB_DATA key;
-	PyObject *py_key;
-	enum TDB_ERROR ret;
-	if (!PyArg_ParseTuple(args, "O", &py_key))
-		return NULL;
-
-	key = PyString_AsTDB_DATA(py_key);
-	ret = tdb_delete(self->ctx, key);
-	PyErr_TDB_ERROR_IS_ERR_RAISE(ret);
-	Py_RETURN_NONE;
-}
-
-static PyObject *obj_has_key(PyTdbObject *self, PyObject *args)
-{
-	TDB_DATA key;
-	PyObject *py_key;
-	if (!PyArg_ParseTuple(args, "O", &py_key))
-		return NULL;
-
-	key = PyString_AsTDB_DATA(py_key);
-	if (tdb_exists(self->ctx, key))
-		return Py_True;
-	if (tdb_error(self->ctx) != TDB_ERR_NOEXIST)
-		PyErr_TDB_ERROR_IS_ERR_RAISE(tdb_error(self->ctx));
-	return Py_False;
-}
-
-static PyObject *obj_store(PyTdbObject *self, PyObject *args)
-{
-	TDB_DATA key, value;
-	enum TDB_ERROR ret;
-	int flag = TDB_REPLACE;
-	PyObject *py_key, *py_value;
-
-	if (!PyArg_ParseTuple(args, "OO|i", &py_key, &py_value, &flag))
-		return NULL;
-
-	key = PyString_AsTDB_DATA(py_key);
-	value = PyString_AsTDB_DATA(py_value);
-
-	ret = tdb_store(self->ctx, key, value, flag);
-	PyErr_TDB_ERROR_IS_ERR_RAISE(ret);
-	Py_RETURN_NONE;
-}
-
-static PyObject *obj_add_flag(PyTdbObject *self, PyObject *args)
-{
-	unsigned flag;
-
-	if (!PyArg_ParseTuple(args, "I", &flag))
-		return NULL;
-
-	tdb_add_flag(self->ctx, flag);
-	Py_RETURN_NONE;
-}
-
-static PyObject *obj_remove_flag(PyTdbObject *self, PyObject *args)
-{
-	unsigned flag;
-
-	if (!PyArg_ParseTuple(args, "I", &flag))
-		return NULL;
-
-	tdb_remove_flag(self->ctx, flag);
-	Py_RETURN_NONE;
-}
-
-typedef struct {
-	PyObject_HEAD
-	TDB_DATA current;
-	bool end;
-	PyTdbObject *iteratee;
-} PyTdbIteratorObject;
-
-static PyObject *tdb_iter_next(PyTdbIteratorObject *self)
-{
-	enum TDB_ERROR e;
-	PyObject *ret;
-	if (self->end)
-		return NULL;
-	ret = PyString_FromStringAndSize((const char *)self->current.dptr,
-					 self->current.dsize);
-	e = tdb_nextkey(self->iteratee->ctx, &self->current);
-	if (e == TDB_ERR_NOEXIST)
-		self->end = true;
-	else
-		PyErr_TDB_ERROR_IS_ERR_RAISE(e);
-	return ret;
-}
-
-static void tdb_iter_dealloc(PyTdbIteratorObject *self)
-{
-	Py_DECREF(self->iteratee);
-	PyObject_Del(self);
-}
-
-PyTypeObject PyTdbIterator = {
-	.tp_name = "Iterator",
-	.tp_basicsize = sizeof(PyTdbIteratorObject),
-	.tp_iternext = (iternextfunc)tdb_iter_next,
-	.tp_dealloc = (destructor)tdb_iter_dealloc,
-	.tp_flags = Py_TPFLAGS_DEFAULT,
-	.tp_iter = PyObject_SelfIter,
-};
-
-static PyObject *tdb_object_iter(PyTdbObject *self)
-{
-	PyTdbIteratorObject *ret;
-	enum TDB_ERROR e;
-
-	ret = PyObject_New(PyTdbIteratorObject, &PyTdbIterator);
-	if (!ret)
-		return NULL;
-	e = tdb_firstkey(self->ctx, &ret->current);
-	if (e == TDB_ERR_NOEXIST) {
-		ret->end = true;
-	} else {
-		PyErr_TDB_ERROR_IS_ERR_RAISE(e);
-		ret->end = false;
-	}
-	ret->iteratee = self;
-	Py_INCREF(self);
-	return (PyObject *)ret;
-}
-
-static PyObject *obj_clear(PyTdbObject *self)
-{
-	enum TDB_ERROR ret = tdb_wipe_all(self->ctx);
-	PyErr_TDB_ERROR_IS_ERR_RAISE(ret);
-	Py_RETURN_NONE;
-}
-
-static PyObject *obj_enable_seqnum(PyTdbObject *self)
-{
-	tdb_add_flag(self->ctx, TDB_SEQNUM);
-	Py_RETURN_NONE;
-}
-
-static PyMethodDef tdb_object_methods[] = {
-	{ "transaction_cancel", (PyCFunction)obj_transaction_cancel, METH_NOARGS,
-		"S.transaction_cancel() -> None\n"
-		"Cancel the currently active transaction." },
-	{ "transaction_commit", (PyCFunction)obj_transaction_commit, METH_NOARGS,
-		"S.transaction_commit() -> None\n"
-		"Commit the currently active transaction." },
-	{ "transaction_prepare_commit", (PyCFunction)obj_transaction_prepare_commit, METH_NOARGS,
-		"S.transaction_prepare_commit() -> None\n"
-		"Prepare to commit the currently active transaction" },
-	{ "transaction_start", (PyCFunction)obj_transaction_start, METH_NOARGS,
-		"S.transaction_start() -> None\n"
-		"Start a new transaction." },
-	{ "lock_all", (PyCFunction)obj_lockall, METH_NOARGS, NULL },
-	{ "unlock_all", (PyCFunction)obj_unlockall, METH_NOARGS, NULL },
-	{ "read_lock_all", (PyCFunction)obj_lockall_read, METH_NOARGS, NULL },
-	{ "read_unlock_all", (PyCFunction)obj_unlockall_read, METH_NOARGS, NULL },
-	{ "close", (PyCFunction)obj_close, METH_NOARGS, NULL },
-	{ "get", (PyCFunction)obj_get, METH_VARARGS, "S.get(key) -> value\n"
-		"Fetch a value." },
-	{ "append", (PyCFunction)obj_append, METH_VARARGS, "S.append(key, value) -> None\n"
-		"Append data to an existing key." },
-	{ "firstkey", (PyCFunction)obj_firstkey, METH_NOARGS, "S.firstkey() -> data\n"
-		"Return the first key in this database." },
-	{ "nextkey", (PyCFunction)obj_nextkey, METH_NOARGS, "S.nextkey(key) -> data\n"
-		"Return the next key in this database." },
-	{ "delete", (PyCFunction)obj_delete, METH_VARARGS, "S.delete(key) -> None\n"
-		"Delete an entry." },
-	{ "has_key", (PyCFunction)obj_has_key, METH_VARARGS, "S.has_key(key) -> None\n"
-		"Check whether key exists in this database." },
-	{ "store", (PyCFunction)obj_store, METH_VARARGS, "S.store(key, data, flag=REPLACE) -> None"
-		"Store data." },
-	{ "add_flag", (PyCFunction)obj_add_flag, METH_VARARGS, "S.add_flag(flag) -> None" },
-	{ "remove_flag", (PyCFunction)obj_remove_flag, METH_VARARGS, "S.remove_flag(flag) -> None" },
-	{ "iterkeys", (PyCFunction)tdb_object_iter, METH_NOARGS, "S.iterkeys() -> iterator" },
-	{ "clear", (PyCFunction)obj_clear, METH_NOARGS, "S.clear() -> None\n"
-		"Wipe the entire database." },
-	{ "enable_seqnum", (PyCFunction)obj_enable_seqnum, METH_NOARGS,
-		"S.enable_seqnum() -> None" },
-	{ NULL }
-};
-
-static PyObject *obj_get_flags(PyTdbObject *self, void *closure)
-{
-	return PyInt_FromLong(tdb_get_flags(self->ctx));
-}
-
-static PyObject *obj_get_filename(PyTdbObject *self, void *closure)
-{
-	return PyString_FromString(tdb_name(self->ctx));
-}
-
-static PyObject *obj_get_seqnum(PyTdbObject *self, void *closure)
-{
-	return PyInt_FromLong(tdb_get_seqnum(self->ctx));
-}
-
-
-static PyGetSetDef tdb_object_getsetters[] = {
-	{ cast_const(char *, "flags"), (getter)obj_get_flags, NULL, NULL },
-	{ cast_const(char *, "filename"), (getter)obj_get_filename, NULL,
-	  cast_const(char *, "The filename of this TDB file.")},
-	{ cast_const(char *, "seqnum"), (getter)obj_get_seqnum, NULL, NULL },
-	{ NULL }
-};
-
-static PyObject *tdb_object_repr(PyTdbObject *self)
-{
-	if (tdb_get_flags(self->ctx) & TDB_INTERNAL) {
-		return PyString_FromString("Tdb(<internal>)");
-	} else {
-		return PyString_FromFormat("Tdb('%s')", tdb_name(self->ctx));
-	}
-}
-
-static void tdb_object_dealloc(PyTdbObject *self)
-{
-	if (!self->closed)
-		tdb_close(self->ctx);
-	self->ob_type->tp_free(self);
-}
-
-static PyObject *obj_getitem(PyTdbObject *self, PyObject *key)
-{
-	TDB_DATA tkey, val;
-	enum TDB_ERROR ret;
-
-	if (!PyString_Check(key)) {
-		PyErr_SetString(PyExc_TypeError, "Expected string as key");
-		return NULL;
-	}
-
-	tkey.dptr = (unsigned char *)PyString_AsString(key);
-	tkey.dsize = PyString_Size(key);
-
-	ret = tdb_fetch(self->ctx, tkey, &val);
-	if (ret == TDB_ERR_NOEXIST) {
-		PyErr_SetString(PyExc_KeyError, "No such TDB entry");
-		return NULL;
-	} else {
-		PyErr_TDB_ERROR_IS_ERR_RAISE(ret);
-		return PyString_FromTDB_DATA(val);
-	}
-}
-
-static int obj_setitem(PyTdbObject *self, PyObject *key, PyObject *value)
-{
-	TDB_DATA tkey, tval;
-	enum TDB_ERROR ret;
-	if (!PyString_Check(key)) {
-		PyErr_SetString(PyExc_TypeError, "Expected string as key");
-		return -1;
-	}
-
-	tkey = PyString_AsTDB_DATA(key);
-
-	if (value == NULL) {
-		ret = tdb_delete(self->ctx, tkey);
-	} else {
-		if (!PyString_Check(value)) {
-			PyErr_SetString(PyExc_TypeError, "Expected string as value");
-			return -1;
-		}
-
-		tval = PyString_AsTDB_DATA(value);
-
-		ret = tdb_store(self->ctx, tkey, tval, TDB_REPLACE);
-	}
-
-	if (ret != TDB_SUCCESS) {
-		PyErr_SetTDBError(ret);
-		return -1;
-	}
-
-	return ret;
-}
-
-static PyMappingMethods tdb_object_mapping = {
-	.mp_subscript = (binaryfunc)obj_getitem,
-	.mp_ass_subscript = (objobjargproc)obj_setitem,
-};
-static PyTypeObject PyTdb = {
-	.tp_name = "tdb.Tdb",
-	.tp_basicsize = sizeof(PyTdbObject),
-	.tp_methods = tdb_object_methods,
-	.tp_getset = tdb_object_getsetters,
-	.tp_new = py_tdb_open,
-	.tp_doc = "A TDB file",
-	.tp_repr = (reprfunc)tdb_object_repr,
-	.tp_dealloc = (destructor)tdb_object_dealloc,
-	.tp_as_mapping = &tdb_object_mapping,
-	.tp_flags = Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE|Py_TPFLAGS_HAVE_ITER,
-	.tp_iter = (getiterfunc)tdb_object_iter,
-};
-
-static PyMethodDef tdb_methods[] = {
-	{ "open", (PyCFunction)py_tdb_open, METH_VARARGS|METH_KEYWORDS, "open(name, hash_size=0, tdb_flags=TDB_DEFAULT, flags=O_RDWR, mode=0600)\n"
-		"Open a TDB file." },
-	{ NULL }
-};
-
-void inittdb(void);
-void inittdb(void)
-{
-	PyObject *m;
-
-	if (PyType_Ready(&PyTdb) < 0)
-		return;
-
-	if (PyType_Ready(&PyTdbIterator) < 0)
-		return;
-
-	m = Py_InitModule3("tdb", tdb_methods, "TDB is a simple key-value database similar to GDBM that supports multiple writers.");
-	if (m == NULL)
-		return;
-
-	PyModule_AddObject(m, "REPLACE", PyInt_FromLong(TDB_REPLACE));
-	PyModule_AddObject(m, "INSERT", PyInt_FromLong(TDB_INSERT));
-	PyModule_AddObject(m, "MODIFY", PyInt_FromLong(TDB_MODIFY));
-
-	PyModule_AddObject(m, "DEFAULT", PyInt_FromLong(TDB_DEFAULT));
-	PyModule_AddObject(m, "INTERNAL", PyInt_FromLong(TDB_INTERNAL));
-	PyModule_AddObject(m, "NOLOCK", PyInt_FromLong(TDB_NOLOCK));
-	PyModule_AddObject(m, "NOMMAP", PyInt_FromLong(TDB_NOMMAP));
-	PyModule_AddObject(m, "CONVERT", PyInt_FromLong(TDB_CONVERT));
-	PyModule_AddObject(m, "NOSYNC", PyInt_FromLong(TDB_NOSYNC));
-	PyModule_AddObject(m, "SEQNUM", PyInt_FromLong(TDB_SEQNUM));
-	PyModule_AddObject(m, "ALLOW_NESTING", PyInt_FromLong(TDB_ALLOW_NESTING));
-
-	PyModule_AddObject(m, "__docformat__", PyString_FromString("restructuredText"));
-
-	PyModule_AddObject(m, "__version__", PyString_FromString(PACKAGE_VERSION));
-
-	Py_INCREF(&PyTdb);
-	PyModule_AddObject(m, "Tdb", (PyObject *)&PyTdb);
-
-	Py_INCREF(&PyTdbIterator);
-}
diff --git a/lib/tdb2/summary.c b/lib/tdb2/summary.c
deleted file mode 100644
index c7e93284e0..0000000000
--- a/lib/tdb2/summary.c
+++ /dev/null
@@ -1,330 +0,0 @@
- /*
-   Trivial Database 2: human-readable summary code
-   Copyright (C) Rusty Russell 2010
-
-   This library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 3 of the License, or (at your option) any later version.
-
-   This library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
-#include "private.h"
-#include <assert.h>
-#include <ccan/tally/tally.h>
-
-#define SUMMARY_FORMAT \
-	"Size of file/data: %zu/%zu\n" \
-	"Number of records: %zu\n" \
-	"Smallest/average/largest keys: %zu/%zu/%zu\n%s" \
-	"Smallest/average/largest data: %zu/%zu/%zu\n%s" \
-	"Smallest/average/largest padding: %zu/%zu/%zu\n%s" \
-	"Number of free records: %zu\n" \
-	"Smallest/average/largest free records: %zu/%zu/%zu\n%s" \
-	"Number of uncoalesced records: %zu\n" \
-	"Smallest/average/largest uncoalesced runs: %zu/%zu/%zu\n%s" \
-	"Toplevel hash used: %u of %u\n" \
-	"Number of chains: %zu\n" \
-	"Number of subhashes: %zu\n" \
-	"Smallest/average/largest subhash entries: %zu/%zu/%zu\n%s" \
-	"Percentage keys/data/padding/free/rechdrs/freehdrs/hashes: %.0f/%.0f/%.0f/%.0f/%.0f/%.0f/%.0f\n"
-
-#define BUCKET_SUMMARY_FORMAT_A					\
-	"Free bucket %zu: total entries %zu.\n"			\
-	"Smallest/average/largest length: %zu/%zu/%zu\n%s"
-#define BUCKET_SUMMARY_FORMAT_B					\
-	"Free bucket %zu-%zu: total entries %zu.\n"		\
-	"Smallest/average/largest length: %zu/%zu/%zu\n%s"
-#define CAPABILITY_FORMAT					\
-	"Capability %llu%s\n"
-
-#define HISTO_WIDTH 70
-#define HISTO_HEIGHT 20
-
-static tdb_off_t count_hash(struct tdb_context *tdb,
-			    tdb_off_t hash_off, unsigned bits)
-{
-	const tdb_off_t *h;
-	tdb_off_t count = 0;
-	unsigned int i;
-
-	h = tdb_access_read(tdb, hash_off, sizeof(*h) << bits, true);
-	if (TDB_PTR_IS_ERR(h)) {
-		return TDB_ERR_TO_OFF(TDB_PTR_ERR(h));
-	}
-	for (i = 0; i < (1 << bits); i++)
-		count += (h[i] != 0);
-
-	tdb_access_release(tdb, h);
-	return count;
-}
-
-static enum TDB_ERROR summarize(struct tdb_context *tdb,
-				struct tally *hashes,
-				struct tally *ftables,
-				struct tally *fr,
-				struct tally *keys,
-				struct tally *data,
-				struct tally *extra,
-				struct tally *uncoal,
-				struct tally *chains,
-				size_t *num_caps)
-{
-	tdb_off_t off;
-	tdb_len_t len;
-	tdb_len_t unc = 0;
-
-	for (off = sizeof(struct tdb_header);
-	     off < tdb->file->map_size;
-	     off += len) {
-		const union {
-			struct tdb_used_record u;
-			struct tdb_free_record f;
-			struct tdb_recovery_record r;
-		} *p;
-		/* We might not be able to get the whole thing. */
-		p = tdb_access_read(tdb, off, sizeof(p->f), true);
-		if (TDB_PTR_IS_ERR(p)) {
-			return TDB_PTR_ERR(p);
-		}
-		if (frec_magic(&p->f) != TDB_FREE_MAGIC) {
-			if (unc > 1) {
-				tally_add(uncoal, unc);
-				unc = 0;
-			}
-		}
-
-		if (p->r.magic == TDB_RECOVERY_INVALID_MAGIC
-		    || p->r.magic == TDB_RECOVERY_MAGIC) {
-			len = sizeof(p->r) + p->r.max_len;
-		} else if (frec_magic(&p->f) == TDB_FREE_MAGIC) {
-			len = frec_len(&p->f);
-			tally_add(fr, len);
-			len += sizeof(p->u);
-			unc++;
-		} else if (rec_magic(&p->u) == TDB_USED_MAGIC) {
-			len = sizeof(p->u)
-				+ rec_key_length(&p->u)
-				+ rec_data_length(&p->u)
-				+ rec_extra_padding(&p->u);
-
-			tally_add(keys, rec_key_length(&p->u));
-			tally_add(data, rec_data_length(&p->u));
-			tally_add(extra, rec_extra_padding(&p->u));
-		} else if (rec_magic(&p->u) == TDB_HTABLE_MAGIC) {
-			tdb_off_t count = count_hash(tdb,
-						     off + sizeof(p->u),
-						     TDB_SUBLEVEL_HASH_BITS);
-			if (TDB_OFF_IS_ERR(count)) {
-				return TDB_OFF_TO_ERR(count);
-			}
-			tally_add(hashes, count);
-			tally_add(extra, rec_extra_padding(&p->u));
-			len = sizeof(p->u)
-				+ rec_data_length(&p->u)
-				+ rec_extra_padding(&p->u);
-		} else if (rec_magic(&p->u) == TDB_FTABLE_MAGIC) {
-			len = sizeof(p->u)
-				+ rec_data_length(&p->u)
-				+ rec_extra_padding(&p->u);
-			tally_add(ftables, rec_data_length(&p->u));
-			tally_add(extra, rec_extra_padding(&p->u));
-		} else if (rec_magic(&p->u) == TDB_CHAIN_MAGIC) {
-			len = sizeof(p->u)
-				+ rec_data_length(&p->u)
-				+ rec_extra_padding(&p->u);
-			tally_add(chains, 1);
-			tally_add(extra, rec_extra_padding(&p->u));
-		} else if (rec_magic(&p->u) == TDB_CAP_MAGIC) {
-			len = sizeof(p->u)
-				+ rec_data_length(&p->u)
-				+ rec_extra_padding(&p->u);
-			(*num_caps)++;
-		} else {
-			len = dead_space(tdb, off);
-			if (TDB_OFF_IS_ERR(len)) {
-				return TDB_OFF_TO_ERR(len);
-			}
-		}
-		tdb_access_release(tdb, p);
-	}
-	if (unc)
-		tally_add(uncoal, unc);
-	return TDB_SUCCESS;
-}
-
-static void add_capabilities(struct tdb_context *tdb, char *summary)
-{
-	tdb_off_t off, next;
-	const struct tdb_capability *cap;
-	size_t count = 0;
-
-	/* Append to summary. */
-	summary += strlen(summary);
-
-	off = tdb_read_off(tdb, offsetof(struct tdb_header, capabilities));
-	if (TDB_OFF_IS_ERR(off))
-		return;
-
-	/* Walk capability list. */
-	for (; off; off = next) {
-		cap = tdb_access_read(tdb, off, sizeof(*cap), true);
-		if (TDB_PTR_IS_ERR(cap)) {
-			break;
-		}
-		count++;
-		sprintf(summary, CAPABILITY_FORMAT,
-			cap->type & TDB_CAP_TYPE_MASK,
-			/* Noopen?  How did we get here? */
-			(cap->type & TDB_CAP_NOOPEN) ? " (unopenable)"
-			: ((cap->type & TDB_CAP_NOWRITE)
-			   && (cap->type & TDB_CAP_NOCHECK)) ? " (uncheckable,read-only)"
-			: (cap->type & TDB_CAP_NOWRITE) ? " (read-only)"
-			: (cap->type & TDB_CAP_NOCHECK) ? " (uncheckable)"
-			: "");
-		summary += strlen(summary);
-		next = cap->next;
-		tdb_access_release(tdb, cap);
-	}
-}
-
-_PUBLIC_ enum TDB_ERROR tdb_summary(struct tdb_context *tdb,
-			   enum tdb_summary_flags flags,
-			   char **summary)
-{
-	tdb_len_t len;
-	size_t num_caps = 0;
-	struct tally *ftables, *hashes, *freet, *keys, *data, *extra, *uncoal,
-		*chains;
-	char *hashesg, *freeg, *keysg, *datag, *extrag, *uncoalg;
-	enum TDB_ERROR ecode;
-
-	hashesg = freeg = keysg = datag = extrag = uncoalg = NULL;
-
-	ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
-	if (ecode != TDB_SUCCESS) {
-		return tdb->last_error = ecode;
-	}
-
-	ecode = tdb_lock_expand(tdb, F_RDLCK);
-	if (ecode != TDB_SUCCESS) {
-		tdb_allrecord_unlock(tdb, F_RDLCK);
-		return tdb->last_error = ecode;
-	}
-
-	/* Start stats off empty. */
-	ftables = tally_new(HISTO_HEIGHT);
-	hashes = tally_new(HISTO_HEIGHT);
-	freet = tally_new(HISTO_HEIGHT);
-	keys = tally_new(HISTO_HEIGHT);
-	data = tally_new(HISTO_HEIGHT);
-	extra = tally_new(HISTO_HEIGHT);
-	uncoal = tally_new(HISTO_HEIGHT);
-	chains = tally_new(HISTO_HEIGHT);
-	if (!ftables || !hashes || !freet || !keys || !data || !extra
-	    || !uncoal || !chains) {
-		ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
-				   "tdb_summary: failed to allocate"
-				   " tally structures");
-		goto unlock;
-	}
-
-	ecode = summarize(tdb, hashes, ftables, freet, keys, data, extra,
-			  uncoal, chains, &num_caps);
-	if (ecode != TDB_SUCCESS) {
-		goto unlock;
-	}
-
-	if (flags & TDB_SUMMARY_HISTOGRAMS) {
-		hashesg = tally_histogram(hashes, HISTO_WIDTH, HISTO_HEIGHT);
-		freeg = tally_histogram(freet, HISTO_WIDTH, HISTO_HEIGHT);
-		keysg = tally_histogram(keys, HISTO_WIDTH, HISTO_HEIGHT);
-		datag = tally_histogram(data, HISTO_WIDTH, HISTO_HEIGHT);
-		extrag = tally_histogram(extra, HISTO_WIDTH, HISTO_HEIGHT);
-		uncoalg = tally_histogram(uncoal, HISTO_WIDTH, HISTO_HEIGHT);
-	}
-
-	/* 20 is max length of a %llu. */
-	len = strlen(SUMMARY_FORMAT) + 33*20 + 1
-		+ (hashesg ? strlen(hashesg) : 0)
-		+ (freeg ? strlen(freeg) : 0)
-		+ (keysg ? strlen(keysg) : 0)
-		+ (datag ? strlen(datag) : 0)
-		+ (extrag ? strlen(extrag) : 0)
-		+ (uncoalg ? strlen(uncoalg) : 0)
-		+ num_caps * (strlen(CAPABILITY_FORMAT) + 20
-			      + strlen(" (uncheckable,read-only)"));
-
-	*summary = malloc(len);
-	if (!*summary) {
-		ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
-				   "tdb_summary: failed to allocate string");
-		goto unlock;
-	}
-
-	sprintf(*summary, SUMMARY_FORMAT,
-		(size_t)tdb->file->map_size,
-		tally_total(keys, NULL) + tally_total(data, NULL),
-		tally_num(keys),
-		tally_min(keys), tally_mean(keys), tally_max(keys),
-		keysg ? keysg : "",
-		tally_min(data), tally_mean(data), tally_max(data),
-		datag ? datag : "",
-		tally_min(extra), tally_mean(extra), tally_max(extra),
-		extrag ? extrag : "",
-		tally_num(freet),
-		tally_min(freet), tally_mean(freet), tally_max(freet),
-		freeg ? freeg : "",
-		tally_total(uncoal, NULL),
-		tally_min(uncoal), tally_mean(uncoal), tally_max(uncoal),
-		uncoalg ? uncoalg : "",
-		(unsigned)count_hash(tdb, offsetof(struct tdb_header,
-						   hashtable),
-				     TDB_TOPLEVEL_HASH_BITS),
-		1 << TDB_TOPLEVEL_HASH_BITS,
-		tally_num(chains),
-		tally_num(hashes),
-		tally_min(hashes), tally_mean(hashes), tally_max(hashes),
-		hashesg ? hashesg : "",
-		tally_total(keys, NULL) * 100.0 / tdb->file->map_size,
-		tally_total(data, NULL) * 100.0 / tdb->file->map_size,
-		tally_total(extra, NULL) * 100.0 / tdb->file->map_size,
-		tally_total(freet, NULL) * 100.0 / tdb->file->map_size,
-		(tally_num(keys) + tally_num(freet) + tally_num(hashes))
-		* sizeof(struct tdb_used_record) * 100.0 / tdb->file->map_size,
-		tally_num(ftables) * sizeof(struct tdb_freetable)
-		* 100.0 / tdb->file->map_size,
-		(tally_num(hashes)
-		 * (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS)
-		 + (sizeof(tdb_off_t) << TDB_TOPLEVEL_HASH_BITS)
-		 + sizeof(struct tdb_chain) * tally_num(chains))
-		* 100.0 / tdb->file->map_size);
-
-	add_capabilities(tdb, *summary);
-
-unlock:
-	free(hashesg);
-	free(freeg);
-	free(keysg);
-	free(datag);
-	free(extrag);
-	free(uncoalg);
-	free(hashes);
-	free(freet);
-	free(keys);
-	free(data);
-	free(extra);
-	free(uncoal);
-	free(ftables);
-	free(chains);
-
-	tdb_allrecord_unlock(tdb, F_RDLCK);
-	tdb_unlock_expand(tdb, F_RDLCK);
-	return tdb->last_error = ecode;
-}
diff --git a/lib/tdb2/tdb.c b/lib/tdb2/tdb.c
deleted file mode 100644
index 5257aa17e3..0000000000
--- a/lib/tdb2/tdb.c
+++ /dev/null
@@ -1,605 +0,0 @@
- /*
-   Trivial Database 2: fetch, store and misc routines.
-   Copyright (C) Rusty Russell 2010
-
-   This library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 3 of the License, or (at your option) any later version.
-
-   This library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
-#include "private.h"
-#ifndef HAVE_LIBREPLACE
-#include <ccan/asprintf/asprintf.h>
-#include <stdarg.h>
-#endif
-
-static enum TDB_ERROR update_rec_hdr(struct tdb_context *tdb,
-				     tdb_off_t off,
-				     tdb_len_t keylen,
-				     tdb_len_t datalen,
-				     struct tdb_used_record *rec,
-				     uint64_t h)
-{
-	uint64_t dataroom = rec_data_length(rec) + rec_extra_padding(rec);
-	enum TDB_ERROR ecode;
-
-	ecode = set_header(tdb, rec, TDB_USED_MAGIC, keylen, datalen,
-			   keylen + dataroom, h);
-	if (ecode == TDB_SUCCESS) {
-		ecode = tdb_write_convert(tdb, off, rec, sizeof(*rec));
-	}
-	return ecode;
-}
-
-static enum TDB_ERROR replace_data(struct tdb_context *tdb,
-				   struct hash_info *h,
-				   struct tdb_data key, struct tdb_data dbuf,
-				   tdb_off_t old_off, tdb_len_t old_room,
-				   bool growing)
-{
-	tdb_off_t new_off;
-	enum TDB_ERROR ecode;
-
-	/* Allocate a new record. */
-	new_off = alloc(tdb, key.dsize, dbuf.dsize, h->h, TDB_USED_MAGIC,
-			growing);
-	if (TDB_OFF_IS_ERR(new_off)) {
-		return TDB_OFF_TO_ERR(new_off);
-	}
-
-	/* We didn't like the existing one: remove it. */
-	if (old_off) {
-		tdb->stats.frees++;
-		ecode = add_free_record(tdb, old_off,
-					sizeof(struct tdb_used_record)
-					+ key.dsize + old_room,
-					TDB_LOCK_WAIT, true);
-		if (ecode == TDB_SUCCESS)
-			ecode = replace_in_hash(tdb, h, new_off);
-	} else {
-		ecode = add_to_hash(tdb, h, new_off);
-	}
-	if (ecode != TDB_SUCCESS) {
-		return ecode;
-	}
-
-	new_off += sizeof(struct tdb_used_record);
-	ecode = tdb->io->twrite(tdb, new_off, key.dptr, key.dsize);
-	if (ecode != TDB_SUCCESS) {
-		return ecode;
-	}
-
-	new_off += key.dsize;
-	ecode = tdb->io->twrite(tdb, new_off, dbuf.dptr, dbuf.dsize);
-	if (ecode != TDB_SUCCESS) {
-		return ecode;
-	}
-
-	if (tdb->flags & TDB_SEQNUM)
-		tdb_inc_seqnum(tdb);
-
-	return TDB_SUCCESS;
-}
-
-static enum TDB_ERROR update_data(struct tdb_context *tdb,
-				  tdb_off_t off,
-				  struct tdb_data dbuf,
-				  tdb_len_t extra)
-{
-	enum TDB_ERROR ecode;
-
-	ecode = tdb->io->twrite(tdb, off, dbuf.dptr, dbuf.dsize);
-	if (ecode == TDB_SUCCESS && extra) {
-		/* Put a zero in; future versions may append other data. */
-		ecode = tdb->io->twrite(tdb, off + dbuf.dsize, "", 1);
-	}
-	if (tdb->flags & TDB_SEQNUM)
-		tdb_inc_seqnum(tdb);
-
-	return ecode;
-}
-
-_PUBLIC_ enum TDB_ERROR tdb_store(struct tdb_context *tdb,
-			 struct tdb_data key, struct tdb_data dbuf, int flag)
-{
-	struct hash_info h;
-	tdb_off_t off;
-	tdb_len_t old_room = 0;
-	struct tdb_used_record rec;
-	enum TDB_ERROR ecode;
-
-	off = find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL);
-	if (TDB_OFF_IS_ERR(off)) {
-		return tdb->last_error = TDB_OFF_TO_ERR(off);
-	}
-
-	/* Now we have lock on this hash bucket. */
-	if (flag == TDB_INSERT) {
-		if (off) {
-			ecode = TDB_ERR_EXISTS;
-			goto out;
-		}
-	} else {
-		if (off) {
-			old_room = rec_data_length(&rec)
-				+ rec_extra_padding(&rec);
-			if (old_room >= dbuf.dsize) {
-				/* Can modify in-place.  Easy! */
-				ecode = update_rec_hdr(tdb, off,
-						       key.dsize, dbuf.dsize,
-						       &rec, h.h);
-				if (ecode != TDB_SUCCESS) {
-					goto out;
-				}
-				ecode = update_data(tdb,
-						    off + sizeof(rec)
-						    + key.dsize, dbuf,
-						    old_room - dbuf.dsize);
-				if (ecode != TDB_SUCCESS) {
-					goto out;
-				}
-				tdb_unlock_hashes(tdb, h.hlock_start,
-						  h.hlock_range, F_WRLCK);
-				return tdb->last_error = TDB_SUCCESS;
-			}
-		} else {
-			if (flag == TDB_MODIFY) {
-				/* if the record doesn't exist and we
-				   are in TDB_MODIFY mode then we should fail
-				   the store */
-				ecode = TDB_ERR_NOEXIST;
-				goto out;
-			}
-		}
-	}
-
-	/* If we didn't use the old record, this implies we're growing. */
-	ecode = replace_data(tdb, &h, key, dbuf, off, old_room, off);
-out:
-	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_WRLCK);
-	return tdb->last_error = ecode;
-}
-
-_PUBLIC_ enum TDB_ERROR tdb_append(struct tdb_context *tdb,
-			  struct tdb_data key, struct tdb_data dbuf)
-{
-	struct hash_info h;
-	tdb_off_t off;
-	struct tdb_used_record rec;
-	tdb_len_t old_room = 0, old_dlen;
-	unsigned char *newdata;
-	struct tdb_data new_dbuf;
-	enum TDB_ERROR ecode;
-
-	off = find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL);
-	if (TDB_OFF_IS_ERR(off)) {
-		return tdb->last_error = TDB_OFF_TO_ERR(off);
-	}
-
-	if (off) {
-		old_dlen = rec_data_length(&rec);
-		old_room = old_dlen + rec_extra_padding(&rec);
-
-		/* Fast path: can append in place. */
-		if (rec_extra_padding(&rec) >= dbuf.dsize) {
-			ecode = update_rec_hdr(tdb, off, key.dsize,
-					       old_dlen + dbuf.dsize, &rec,
-					       h.h);
-			if (ecode != TDB_SUCCESS) {
-				goto out;
-			}
-
-			off += sizeof(rec) + key.dsize + old_dlen;
-			ecode = update_data(tdb, off, dbuf,
-					    rec_extra_padding(&rec));
-			goto out;
-		}
-
-		/* Slow path. */
-		newdata = malloc(key.dsize + old_dlen + dbuf.dsize);
-		if (!newdata) {
-			ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
-					   "tdb_append:"
-					   " failed to allocate %zu bytes",
-					   (size_t)(key.dsize + old_dlen
-						    + dbuf.dsize));
-			goto out;
-		}
-		ecode = tdb->io->tread(tdb, off + sizeof(rec) + key.dsize,
-				       newdata, old_dlen);
-		if (ecode != TDB_SUCCESS) {
-			goto out_free_newdata;
-		}
-		memcpy(newdata + old_dlen, dbuf.dptr, dbuf.dsize);
-		new_dbuf.dptr = newdata;
-		new_dbuf.dsize = old_dlen + dbuf.dsize;
-	} else {
-		newdata = NULL;
-		new_dbuf = dbuf;
-	}
-
-	/* If they're using tdb_append(), it implies they're growing record. */
-	ecode = replace_data(tdb, &h, key, new_dbuf, off, old_room, true);
-
-out_free_newdata:
-	free(newdata);
-out:
-	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_WRLCK);
-	return tdb->last_error = ecode;
-}
-
-_PUBLIC_ enum TDB_ERROR tdb_fetch(struct tdb_context *tdb, struct tdb_data key,
-			 struct tdb_data *data)
-{
-	tdb_off_t off;
-	struct tdb_used_record rec;
-	struct hash_info h;
-	enum TDB_ERROR ecode;
-
-	off = find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL);
-	if (TDB_OFF_IS_ERR(off)) {
-		return tdb->last_error = TDB_OFF_TO_ERR(off);
-	}
-
-	if (!off) {
-		ecode = TDB_ERR_NOEXIST;
-	} else {
-		data->dsize = rec_data_length(&rec);
-		data->dptr = tdb_alloc_read(tdb, off + sizeof(rec) + key.dsize,
-					    data->dsize);
-		if (TDB_PTR_IS_ERR(data->dptr)) {
-			ecode = TDB_PTR_ERR(data->dptr);
-		} else
-			ecode = TDB_SUCCESS;
-	}
-
-	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK);
-	return tdb->last_error = ecode;
-}
-
-_PUBLIC_ bool tdb_exists(struct tdb_context *tdb, TDB_DATA key)
-{
-	tdb_off_t off;
-	struct tdb_used_record rec;
-	struct hash_info h;
-
-	off = find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL);
-	if (TDB_OFF_IS_ERR(off)) {
-		tdb->last_error = TDB_OFF_TO_ERR(off);
-		return false;
-	}
-	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK);
-
-	tdb->last_error = TDB_SUCCESS;
-	return off ? true : false;
-}
-
-_PUBLIC_ enum TDB_ERROR tdb_delete(struct tdb_context *tdb, struct tdb_data key)
-{
-	tdb_off_t off;
-	struct tdb_used_record rec;
-	struct hash_info h;
-	enum TDB_ERROR ecode;
-
-	off = find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL);
-	if (TDB_OFF_IS_ERR(off)) {
-		return tdb->last_error = TDB_OFF_TO_ERR(off);
-	}
-
-	if (!off) {
-		ecode = TDB_ERR_NOEXIST;
-		goto unlock;
-	}
-
-	ecode = delete_from_hash(tdb, &h);
-	if (ecode != TDB_SUCCESS) {
-		goto unlock;
-	}
-
-	/* Free the deleted entry. */
-	tdb->stats.frees++;
-	ecode = add_free_record(tdb, off,
-				sizeof(struct tdb_used_record)
-				+ rec_key_length(&rec)
-				+ rec_data_length(&rec)
-				+ rec_extra_padding(&rec),
-				TDB_LOCK_WAIT, true);
-
-	if (tdb->flags & TDB_SEQNUM)
-		tdb_inc_seqnum(tdb);
-
-unlock:
-	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_WRLCK);
-	return tdb->last_error = ecode;
-}
-
-_PUBLIC_ unsigned int tdb_get_flags(struct tdb_context *tdb)
-{
-	return tdb->flags;
-}
-
-static bool inside_transaction(const struct tdb_context *tdb)
-{
-	return tdb->transaction != NULL;
-}
-
-static bool readonly_changable(struct tdb_context *tdb, const char *caller)
-{
-	if (inside_transaction(tdb)) {
-		tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
-					     TDB_LOG_USE_ERROR,
-					     "%s: can't change"
-					     " TDB_RDONLY inside transaction",
-					     caller);
-		return false;
-	}
-	return true;
-}
-
-_PUBLIC_ void tdb_add_flag(struct tdb_context *tdb, unsigned flag)
-{
-	if (tdb->flags & TDB_INTERNAL) {
-		tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
-					     TDB_LOG_USE_ERROR,
-					     "tdb_add_flag: internal db");
-		return;
-	}
-	switch (flag) {
-	case TDB_NOLOCK:
-		tdb->flags |= TDB_NOLOCK;
-		break;
-	case TDB_NOMMAP:
-		tdb->flags |= TDB_NOMMAP;
-#ifndef HAVE_INCOHERENT_MMAP
-		tdb_munmap(tdb->file);
-#endif
-		break;
-	case TDB_NOSYNC:
-		tdb->flags |= TDB_NOSYNC;
-		break;
-	case TDB_SEQNUM:
-		tdb->flags |= TDB_SEQNUM;
-		break;
-	case TDB_ALLOW_NESTING:
-		tdb->flags |= TDB_ALLOW_NESTING;
-		break;
-	case TDB_RDONLY:
-		if (readonly_changable(tdb, "tdb_add_flag"))
-			tdb->flags |= TDB_RDONLY;
-		break;
-	default:
-		tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
-					     TDB_LOG_USE_ERROR,
-					     "tdb_add_flag: Unknown flag %u",
-					     flag);
-	}
-}
-
-_PUBLIC_ void tdb_remove_flag(struct tdb_context *tdb, unsigned flag)
-{
-	if (tdb->flags & TDB_INTERNAL) {
-		tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
-					     TDB_LOG_USE_ERROR,
-					     "tdb_remove_flag: internal db");
-		return;
-	}
-	switch (flag) {
-	case TDB_NOLOCK:
-		tdb->flags &= ~TDB_NOLOCK;
-		break;
-	case TDB_NOMMAP:
-		tdb->flags &= ~TDB_NOMMAP;
-#ifndef HAVE_INCOHERENT_MMAP
-		/* If mmap incoherent, we were mmaping anyway. */
-		tdb_mmap(tdb);
-#endif
-		break;
-	case TDB_NOSYNC:
-		tdb->flags &= ~TDB_NOSYNC;
-		break;
-	case TDB_SEQNUM:
-		tdb->flags &= ~TDB_SEQNUM;
-		break;
-	case TDB_ALLOW_NESTING:
-		tdb->flags &= ~TDB_ALLOW_NESTING;
-		break;
-	case TDB_RDONLY:
-		if ((tdb->open_flags & O_ACCMODE) == O_RDONLY) {
-			tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
-						     TDB_LOG_USE_ERROR,
-						     "tdb_remove_flag: can't"
-						     " remove TDB_RDONLY on tdb"
-						     " opened with O_RDONLY");
-			break;
-		}
-		if (readonly_changable(tdb, "tdb_remove_flag"))
-			tdb->flags &= ~TDB_RDONLY;
-		break;
-	default:
-		tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
-					     TDB_LOG_USE_ERROR,
-					     "tdb_remove_flag: Unknown flag %u",
-					     flag);
-	}
-}
-
-_PUBLIC_ const char *tdb_errorstr(enum TDB_ERROR ecode)
-{
-	/* Gcc warns if you miss a case in the switch, so use that. */
-	switch (TDB_ERR_TO_OFF(ecode)) {
-	case TDB_ERR_TO_OFF(TDB_SUCCESS): return "Success";
-	case TDB_ERR_TO_OFF(TDB_ERR_CORRUPT): return "Corrupt database";
-	case TDB_ERR_TO_OFF(TDB_ERR_IO): return "IO Error";
-	case TDB_ERR_TO_OFF(TDB_ERR_LOCK): return "Locking error";
-	case TDB_ERR_TO_OFF(TDB_ERR_OOM): return "Out of memory";
-	case TDB_ERR_TO_OFF(TDB_ERR_EXISTS): return "Record exists";
-	case TDB_ERR_TO_OFF(TDB_ERR_EINVAL): return "Invalid parameter";
-	case TDB_ERR_TO_OFF(TDB_ERR_NOEXIST): return "Record does not exist";
-	case TDB_ERR_TO_OFF(TDB_ERR_RDONLY): return "write not permitted";
-	}
-	return "Invalid error code";
-}
-
-_PUBLIC_ enum TDB_ERROR tdb_error(struct tdb_context *tdb)
-{
-	return tdb->last_error;
-}
-
-enum TDB_ERROR COLD tdb_logerr(struct tdb_context *tdb,
-			       enum TDB_ERROR ecode,
-			       enum tdb_log_level level,
-			       const char *fmt, ...)
-{
-	char *message;
-	va_list ap;
-	size_t len;
-	/* tdb_open paths care about errno, so save it. */
-	int saved_errno = errno;
-
-	if (!tdb->log_fn)
-		return ecode;
-
-	va_start(ap, fmt);
-	len = vasprintf(&message, fmt, ap);
-	va_end(ap);
-
-	if (len < 0) {
-		tdb->log_fn(tdb, TDB_LOG_ERROR, TDB_ERR_OOM,
-			    "out of memory formatting message:", tdb->log_data);
-		tdb->log_fn(tdb, level, ecode, fmt, tdb->log_data);
-	} else {
-		tdb->log_fn(tdb, level, ecode, message, tdb->log_data);
-		free(message);
-	}
-	errno = saved_errno;
-	return ecode;
-}
-
-_PUBLIC_ enum TDB_ERROR tdb_parse_record_(struct tdb_context *tdb,
-				 TDB_DATA key,
-				 enum TDB_ERROR (*parse)(TDB_DATA k,
-							 TDB_DATA d,
-							 void *data),
-				 void *data)
-{
-	tdb_off_t off;
-	struct tdb_used_record rec;
-	struct hash_info h;
-	enum TDB_ERROR ecode;
-
-	off = find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL);
-	if (TDB_OFF_IS_ERR(off)) {
-		return tdb->last_error = TDB_OFF_TO_ERR(off);
-	}
-
-	if (!off) {
-		ecode = TDB_ERR_NOEXIST;
-	} else {
-		const void *dptr;
-		dptr = tdb_access_read(tdb, off + sizeof(rec) + key.dsize,
-				       rec_data_length(&rec), false);
-		if (TDB_PTR_IS_ERR(dptr)) {
-			ecode = TDB_PTR_ERR(dptr);
-		} else {
-			TDB_DATA d = tdb_mkdata(dptr, rec_data_length(&rec));
-
-			ecode = parse(key, d, data);
-			tdb_access_release(tdb, dptr);
-		}
-	}
-
-	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK);
-	return tdb->last_error = ecode;
-}
-
-_PUBLIC_ const char *tdb_name(const struct tdb_context *tdb)
-{
-	return tdb->name;
-}
-
-_PUBLIC_ int64_t tdb_get_seqnum(struct tdb_context *tdb)
-{
-	tdb_off_t off;
-
-	off = tdb_read_off(tdb, offsetof(struct tdb_header, seqnum));
-	if (TDB_OFF_IS_ERR(off))
-		tdb->last_error = TDB_OFF_TO_ERR(off);
-	else
-		tdb->last_error = TDB_SUCCESS;
-	return off;
-}
-
-
-_PUBLIC_ int tdb_fd(const struct tdb_context *tdb)
-{
-	return tdb->file->fd;
-}
-
-struct traverse_state {
-	enum TDB_ERROR error;
-	struct tdb_context *dest_db;
-};
-
-/*
-  traverse function for repacking
- */
-static int repack_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data,
-			   struct traverse_state *state)
-{
-	state->error = tdb_store(state->dest_db, key, data, TDB_INSERT);
-	if (state->error != TDB_SUCCESS) {
-		return -1;
-	}
-	return 0;
-}
-
-_PUBLIC_ enum TDB_ERROR tdb_repack(struct tdb_context *tdb)
-{
-	struct tdb_context *tmp_db;
-	struct traverse_state state;
-
-	state.error = tdb_transaction_start(tdb);
-	if (state.error != TDB_SUCCESS) {
-		return state.error;
-	}
-
-	tmp_db = tdb_open("tmpdb", TDB_INTERNAL, O_RDWR|O_CREAT, 0, NULL);
-	if (tmp_db == NULL) {
-		state.error = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
-					 __location__
-					 " Failed to create tmp_db");
-		tdb_transaction_cancel(tdb);
-		return tdb->last_error = state.error;
-	}
-
-	state.dest_db = tmp_db;
-	if (tdb_traverse(tdb, repack_traverse, &state) < 0) {
-		goto fail;
-	}
-
-	state.error = tdb_wipe_all(tdb);
-	if (state.error != TDB_SUCCESS) {
-		goto fail;
-	}
-
-	state.dest_db = tdb;
-	if (tdb_traverse(tmp_db, repack_traverse, &state) < 0) {
-		goto fail;
-	}
-
-	tdb_close(tmp_db);
-	return tdb_transaction_commit(tdb);
-
-fail:
-	tdb_transaction_cancel(tdb);
-	tdb_close(tmp_db);
-	return state.error;
-}
diff --git a/lib/tdb2/tdb.pc.in b/lib/tdb2/tdb.pc.in
deleted file mode 100644
index 75e69d7363..0000000000
--- a/lib/tdb2/tdb.pc.in
+++ /dev/null
@@ -1,11 +0,0 @@
-prefix=@prefix@
-exec_prefix=@exec_prefix@
-libdir=@libdir@
-includedir=@includedir@
-
-Name: tdb
-Description: A trivial database
-Version: @PACKAGE_VERSION@
-Libs: @LIB_RPATH@ -L${libdir} -ltdb
-Cflags: -I${includedir}
-URL: http://tdb.samba.org/
diff --git a/lib/tdb2/tdb2.h b/lib/tdb2/tdb2.h
deleted file mode 100644
index f7aa0cc310..0000000000
--- a/lib/tdb2/tdb2.h
+++ /dev/null
@@ -1,897 +0,0 @@
-#ifndef CCAN_TDB2_H
-#define CCAN_TDB2_H
-
-/*
-   TDB version 2: trivial database library
-
-   Copyright (C) Andrew Tridgell 1999-2004
-   Copyright (C) Rusty Russell 2010-2011
-
-     ** NOTE! The following LGPL license applies to the tdb
-     ** library. This does NOT imply that all of Samba is released
-     ** under the LGPL
-
-   This library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 3 of the License, or (at your option) any later version.
-
-   This library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-#ifdef HAVE_LIBREPLACE
-#include <replace.h>
-#else
-#if HAVE_FILE_OFFSET_BITS
-#define _FILE_OFFSET_BITS 64
-#endif
-/* For mode_t */
-#include <sys/types.h>
-/* For O_* flags. */
-#include <sys/stat.h>
-/* For sig_atomic_t. */
-#include <signal.h>
-/* For uint64_t */
-#include <stdint.h>
-/* For bool */
-#include <stdbool.h>
-/* For memcmp */
-#include <string.h>
-#endif
-
-#if HAVE_CCAN
-#include <ccan/compiler/compiler.h>
-#include <ccan/typesafe_cb/typesafe_cb.h>
-#include <ccan/cast/cast.h>
-#else
-#ifndef typesafe_cb_preargs
-/* Failing to have CCAN just mean less typesafe protection, etc. */
-#define typesafe_cb_preargs(rtype, atype, fn, arg, ...)	\
-	((rtype (*)(__VA_ARGS__, atype))(fn))
-#endif
-#ifndef cast_const
-#if defined(__intptr_t_defined) || defined(HAVE_INTPTR_T)
-#define cast_const(type, expr) ((type)((intptr_t)(expr)))
-#else
-#define cast_const(type, expr) ((type *)(expr))
-#endif
-#endif
-#endif /* !HAVE_CCAN */
-
-union tdb_attribute;
-struct tdb_context;
-
-/**
- * tdb_open - open a database file
- * @name: the file name (can be NULL if flags contains TDB_INTERNAL)
- * @tdb_flags: options for this database
- * @open_flags: flags argument for tdb's open() call.
- * @mode: mode argument for tdb's open() call.
- * @attributes: linked list of extra attributes for this tdb.
- *
- * This call opens (and potentially creates) a database file.
- * Multiple processes can have the TDB file open at once.
- *
- * On failure it will return NULL, and set errno: it may also call
- * any log attribute found in @attributes.
- *
- * See also:
- *	union tdb_attribute
- */
-struct tdb_context *tdb_open(const char *name, int tdb_flags,
-			     int open_flags, mode_t mode,
-			     union tdb_attribute *attributes);
-
-
-/* flags for tdb_open() */
-#define TDB_DEFAULT 0 /* just a readability place holder */
-#define TDB_INTERNAL 2 /* don't store on disk */
-#define TDB_NOLOCK   4 /* don't do any locking */
-#define TDB_NOMMAP   8 /* don't use mmap */
-#define TDB_CONVERT 16 /* convert endian */
-#define TDB_NOSYNC   64 /* don't use synchronous transactions */
-#define TDB_SEQNUM   128 /* maintain a sequence number */
-#define TDB_ALLOW_NESTING   256 /* fake nested transactions */
-#define TDB_RDONLY   512 /* implied by O_RDONLY */
-#define TDB_CANT_CHECK  2048 /* has a feature which we don't understand */
-
-/**
- * tdb_close - close and free a tdb.
- * @tdb: the tdb context returned from tdb_open()
- *
- * This always succeeds, in that @tdb is unusable after this call.  But if
- * some unexpected error occurred while closing, it will return non-zero
- * (the only clue as to cause will be via the log attribute).
- */
-int tdb_close(struct tdb_context *tdb);
-
-/**
- * struct tdb_data - representation of keys or values.
- * @dptr: the data pointer
- * @dsize: the size of the data pointed to by dptr.
- *
- * This is the "blob" representation of keys and data used by TDB.
- */
-typedef struct tdb_data {
-	unsigned char *dptr;
-	size_t dsize;
-} TDB_DATA;
-
-/**
- * enum TDB_ERROR - error returns for TDB
- *
- * See Also:
- *	tdb_errorstr()
- */
-enum TDB_ERROR {
-	TDB_SUCCESS	= 0,	/* No error. */
-	TDB_ERR_CORRUPT = -1,	/* We read the db, and it was bogus. */
-	TDB_ERR_IO	= -2,	/* We couldn't read/write the db. */
-	TDB_ERR_LOCK	= -3,	/* Locking failed. */
-	TDB_ERR_OOM	= -4,	/* Out of Memory. */
-	TDB_ERR_EXISTS	= -5,	/* The key already exists. */
-	TDB_ERR_NOEXIST	= -6,	/* The key does not exist. */
-	TDB_ERR_EINVAL	= -7,	/* You're using it wrong. */
-	TDB_ERR_RDONLY	= -8,	/* The database is read-only. */
-	TDB_ERR_LAST = TDB_ERR_RDONLY
-};
-
-/**
- * tdb_store - store a key/value pair in a tdb.
- * @tdb: the tdb context returned from tdb_open()
- * @key: the key
- * @dbuf: the data to associate with the key.
- * @flag: TDB_REPLACE, TDB_INSERT or TDB_MODIFY.
- *
- * This inserts (or overwrites) a key/value pair in the TDB.  If flag
- * is TDB_REPLACE, it doesn't matter whether the key exists or not;
- * TDB_INSERT means it must not exist (returns TDB_ERR_EXISTS otherwise),
- * and TDB_MODIFY means it must exist (returns TDB_ERR_NOEXIST otherwise).
- *
- * On success, this returns TDB_SUCCESS.
- *
- * See also:
- *	tdb_fetch, tdb_transaction_start, tdb_append, tdb_delete.
- */
-enum TDB_ERROR tdb_store(struct tdb_context *tdb,
-			 struct tdb_data key,
-			 struct tdb_data dbuf,
-			 int flag);
-
-/* flags to tdb_store() */
-#define TDB_REPLACE 1		/* A readability place holder */
-#define TDB_INSERT 2 		/* Don't overwrite an existing entry */
-#define TDB_MODIFY 3		/* Don't create an existing entry    */
-
-/**
- * tdb_fetch - fetch a value from a tdb.
- * @tdb: the tdb context returned from tdb_open()
- * @key: the key
- * @data: pointer to data.
- *
- * This looks up a key in the database and sets it in @data.
- *
- * If it returns TDB_SUCCESS, the key was found: it is your
- * responsibility to call free() on @data->dptr.
- *
- * Otherwise, it returns an error (usually, TDB_ERR_NOEXIST) and @data is
- * undefined.
- */
-enum TDB_ERROR tdb_fetch(struct tdb_context *tdb, struct tdb_data key,
-			 struct tdb_data *data);
-
-/**
- * tdb_errorstr - map the tdb error onto a constant readable string
- * @ecode: the enum TDB_ERROR to map.
- *
- * This is useful for displaying errors to users.
- */
-const char *tdb_errorstr(enum TDB_ERROR ecode);
-
-/**
- * tdb_append - append a value to a key/value pair in a tdb.
- * @tdb: the tdb context returned from tdb_open()
- * @key: the key
- * @dbuf: the data to append.
- *
- * This is equivalent to fetching a record, reallocating .dptr to add the
- * data, and writing it back, only it's much more efficient.  If the key
- * doesn't exist, it's equivalent to tdb_store (with an additional hint that
- * you expect to expand the record in future).
- *
- * See Also:
- *	tdb_fetch(), tdb_store()
- */
-enum TDB_ERROR tdb_append(struct tdb_context *tdb,
-			  struct tdb_data key, struct tdb_data dbuf);
-
-/**
- * tdb_delete - delete a key from a tdb.
- * @tdb: the tdb context returned from tdb_open()
- * @key: the key to delete.
- *
- * Returns TDB_SUCCESS on success, or an error (usually TDB_ERR_NOEXIST).
- *
- * See Also:
- *	tdb_fetch(), tdb_store()
- */
-enum TDB_ERROR tdb_delete(struct tdb_context *tdb, struct tdb_data key);
-
-/**
- * tdb_exists - does a key exist in the database?
- * @tdb: the tdb context returned from tdb_open()
- * @key: the key to search for.
- *
- * Returns true if it exists, or false if it doesn't or any other error.
- */
-bool tdb_exists(struct tdb_context *tdb, TDB_DATA key);
-
-/**
- * tdb_deq - are struct tdb_data equal?
- * @a: one struct tdb_data
- * @b: another struct tdb_data
- */
-static inline bool tdb_deq(struct tdb_data a, struct tdb_data b)
-{
-	return a.dsize == b.dsize && memcmp(a.dptr, b.dptr, a.dsize) == 0;
-}
-
-/**
- * tdb_mkdata - make a struct tdb_data from const data
- * @p: the constant pointer
- * @len: the length
- *
- * As the dptr member of struct tdb_data is not constant, you need to
- * cast it.  This function keeps thost casts in one place, as well as
- * suppressing the warning some compilers give when casting away a
- * qualifier (eg. gcc with -Wcast-qual)
- */
-static inline struct tdb_data tdb_mkdata(const void *p, size_t len)
-{
-	struct tdb_data d;
-	d.dptr = cast_const(void *, p);
-	d.dsize = len;
-	return d;
-}
-
-/**
- * tdb_transaction_start - start a transaction
- * @tdb: the tdb context returned from tdb_open()
- *
- * This begins a series of atomic operations.  Other processes will be able
- * to read the tdb, but not alter it (they will block), nor will they see
- * any changes until tdb_transaction_commit() is called.
- *
- * Note that if the TDB_ALLOW_NESTING flag is set, a tdb_transaction_start()
- * within a transaction will succeed, but it's not a real transaction:
- * (1) An inner transaction which is committed is not actually committed until
- *     the outer transaction is; if the outer transaction is cancelled, the
- *     inner ones are discarded.
- * (2) tdb_transaction_cancel() marks the outer transaction as having an error,
- *     so the final tdb_transaction_commit() will fail.
- * (3) the outer transaction will see the results of the inner transaction.
- *
- * See Also:
- *	tdb_transaction_cancel, tdb_transaction_commit.
- */
-enum TDB_ERROR tdb_transaction_start(struct tdb_context *tdb);
-
-/**
- * tdb_transaction_cancel - abandon a transaction
- * @tdb: the tdb context returned from tdb_open()
- *
- * This aborts a transaction, discarding any changes which were made.
- * tdb_close() does this implicitly.
- */
-void tdb_transaction_cancel(struct tdb_context *tdb);
-
-/**
- * tdb_transaction_commit - commit a transaction
- * @tdb: the tdb context returned from tdb_open()
- *
- * This completes a transaction, writing any changes which were made.
- *
- * fsync() is used to commit the transaction (unless TDB_NOSYNC is set),
- * making it robust against machine crashes, but very slow compared to
- * other TDB operations.
- *
- * A failure can only be caused by unexpected errors (eg. I/O or
- * memory); this is no point looping on transaction failure.
- *
- * See Also:
- *	tdb_transaction_prepare_commit()
- */
-enum TDB_ERROR tdb_transaction_commit(struct tdb_context *tdb);
-
-/**
- * tdb_transaction_prepare_commit - prepare to commit a transaction
- * @tdb: the tdb context returned from tdb_open()
- *
- * This ensures we have the resources to commit a transaction (using
- * tdb_transaction_commit): if this succeeds then a transaction will only
- * fail if the write() or fsync() calls fail.
- *
- * If this fails you must still call tdb_transaction_cancel() to cancel
- * the transaction.
- *
- * See Also:
- *	tdb_transaction_commit()
- */
-enum TDB_ERROR tdb_transaction_prepare_commit(struct tdb_context *tdb);
-
-/**
- * tdb_traverse - traverse a TDB
- * @tdb: the tdb context returned from tdb_open()
- * @fn: the function to call for every key/value pair (or NULL)
- * @p: the pointer to hand to @f
- *
- * This walks the TDB until all they keys have been traversed, or @fn
- * returns non-zero.  If the traverse function or other processes are
- * changing data or adding or deleting keys, the traverse may be
- * unreliable: keys may be skipped or (rarely) visited twice.
- *
- * There is one specific exception: the special case of deleting the
- * current key does not undermine the reliability of the traversal.
- *
- * On success, returns the number of keys iterated.  On error returns
- * a negative enum TDB_ERROR value.
- */
-#define tdb_traverse(tdb, fn, p)					\
-	tdb_traverse_(tdb, typesafe_cb_preargs(int, void *, (fn), (p),	\
-					       struct tdb_context *,	\
-					       TDB_DATA, TDB_DATA), (p))
-
-int64_t tdb_traverse_(struct tdb_context *tdb,
-		      int (*fn)(struct tdb_context *,
-				TDB_DATA, TDB_DATA, void *), void *p);
-
-/**
- * tdb_parse_record - operate directly on data in the database.
- * @tdb: the tdb context returned from tdb_open()
- * @key: the key whose record we should hand to @parse
- * @parse: the function to call for the data
- * @data: the private pointer to hand to @parse (types must match).
- *
- * This avoids a copy for many cases, by handing you a pointer into
- * the memory-mapped database.  It also locks the record to prevent
- * other accesses at the same time.
- *
- * Do not alter the data handed to parse()!
- */
-#define tdb_parse_record(tdb, key, parse, data)				\
-	tdb_parse_record_((tdb), (key),					\
-			  typesafe_cb_preargs(enum TDB_ERROR, void *,	\
-					      (parse), (data),		\
-					      TDB_DATA, TDB_DATA), (data))
-
-enum TDB_ERROR tdb_parse_record_(struct tdb_context *tdb,
-				 TDB_DATA key,
-				 enum TDB_ERROR (*parse)(TDB_DATA k,
-							 TDB_DATA d,
-							 void *data),
-				 void *data);
-
-/**
- * tdb_get_seqnum - get a database sequence number
- * @tdb: the tdb context returned from tdb_open()
- *
- * This returns a sequence number: any change to the database from a
- * tdb context opened with the TDB_SEQNUM flag will cause that number
- * to increment.  Note that the incrementing is unreliable (it is done
- * without locking), so this is only useful as an optimization.
- *
- * For example, you may have a regular database backup routine which
- * does not operate if the sequence number is unchanged.  In the
- * unlikely event of a failed increment, it will be backed up next
- * time any way.
- *
- * Returns an enum TDB_ERROR (ie. negative) on error.
- */
-int64_t tdb_get_seqnum(struct tdb_context *tdb);
-
-/**
- * tdb_firstkey - get the "first" key in a TDB
- * @tdb: the tdb context returned from tdb_open()
- * @key: pointer to key.
- *
- * This returns an arbitrary key in the database; with tdb_nextkey() it allows
- * open-coded traversal of the database, though it is slightly less efficient
- * than tdb_traverse.
- *
- * It is your responsibility to free @key->dptr on success.
- *
- * Returns TDB_ERR_NOEXIST if the database is empty.
- */
-enum TDB_ERROR tdb_firstkey(struct tdb_context *tdb, struct tdb_data *key);
-
-/**
- * tdb_nextkey - get the "next" key in a TDB
- * @tdb: the tdb context returned from tdb_open()
- * @key: a key returned by tdb_firstkey() or tdb_nextkey().
- *
- * This returns another key in the database; it will free @key.dptr for
- * your convenience.
- *
- * Returns TDB_ERR_NOEXIST if there are no more keys.
- */
-enum TDB_ERROR tdb_nextkey(struct tdb_context *tdb, struct tdb_data *key);
-
-/**
- * tdb_chainlock - lock a record in the TDB
- * @tdb: the tdb context returned from tdb_open()
- * @key: the key to lock.
- *
- * This prevents any access occurring to a group of keys including @key,
- * even if @key does not exist.  This allows primitive atomic updates of
- * records without using transactions.
- *
- * You cannot begin a transaction while holding a tdb_chainlock(), nor can
- * you do any operations on any other keys in the database.  This also means
- * that you cannot hold more than one tdb_chainlock() at a time.
- *
- * See Also:
- *	tdb_chainunlock()
- */
-enum TDB_ERROR tdb_chainlock(struct tdb_context *tdb, TDB_DATA key);
-
-/**
- * tdb_chainunlock - unlock a record in the TDB
- * @tdb: the tdb context returned from tdb_open()
- * @key: the key to unlock.
- *
- * The key must have previously been locked by tdb_chainlock().
- */
-void tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key);
-
-/**
- * tdb_chainlock_read - lock a record in the TDB, for reading
- * @tdb: the tdb context returned from tdb_open()
- * @key: the key to lock.
- *
- * This prevents any changes from occurring to a group of keys including @key,
- * even if @key does not exist.  This allows primitive atomic updates of
- * records without using transactions.
- *
- * You cannot begin a transaction while holding a tdb_chainlock_read(), nor can
- * you do any operations on any other keys in the database.  This also means
- * that you cannot hold more than one tdb_chainlock()/read() at a time.
- *
- * See Also:
- *	tdb_chainlock()
- */
-enum TDB_ERROR tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key);
-
-/**
- * tdb_chainunlock_read - unlock a record in the TDB for reading
- * @tdb: the tdb context returned from tdb_open()
- * @key: the key to unlock.
- *
- * The key must have previously been locked by tdb_chainlock_read().
- */
-void tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key);
-
-/**
- * tdb_lockall - lock the entire TDB
- * @tdb: the tdb context returned from tdb_open()
- *
- * You cannot hold a tdb_chainlock while calling this.  It nests, so you
- * must call tdb_unlockall as many times as you call tdb_lockall.
- */
-enum TDB_ERROR tdb_lockall(struct tdb_context *tdb);
-
-/**
- * tdb_unlockall - unlock the entire TDB
- * @tdb: the tdb context returned from tdb_open()
- */
-void tdb_unlockall(struct tdb_context *tdb);
-
-/**
- * tdb_lockall_read - lock the entire TDB for reading
- * @tdb: the tdb context returned from tdb_open()
- *
- * This prevents others writing to the database, eg. tdb_delete, tdb_store,
- * tdb_append, but not tdb_fetch.
- *
- * You cannot hold a tdb_chainlock while calling this.  It nests, so you
- * must call tdb_unlockall_read as many times as you call tdb_lockall_read.
- */
-enum TDB_ERROR tdb_lockall_read(struct tdb_context *tdb);
-
-/**
- * tdb_unlockall_read - unlock the entire TDB for reading
- * @tdb: the tdb context returned from tdb_open()
- */
-void tdb_unlockall_read(struct tdb_context *tdb);
-
-/**
- * tdb_wipe_all - wipe the database clean
- * @tdb: the tdb context returned from tdb_open()
- *
- * Completely erase the database.  This is faster than iterating through
- * each key and doing tdb_delete.
- */
-enum TDB_ERROR tdb_wipe_all(struct tdb_context *tdb);
-
-/**
- * tdb_repack - repack the database
- * @tdb: the tdb context returned from tdb_open()
- *
- * This repacks the database; if it is suffering from a great deal of
- * fragmentation this might help.  However, it can take twice the
- * memory of the existing TDB.
- */
-enum TDB_ERROR tdb_repack(struct tdb_context *tdb);
-
-/**
- * tdb_check - check a TDB for consistency
- * @tdb: the tdb context returned from tdb_open()
- * @check: function to check each key/data pair (or NULL)
- * @data: argument for @check, must match type.
- *
- * This performs a consistency check of the open database, optionally calling
- * a check() function on each record so you can do your own data consistency
- * checks as well.  If check() returns an error, that is returned from
- * tdb_check().
- *
- * Note that the TDB uses a feature which we don't understand which
- * indicates we can't run tdb_check(), this will log a warning to that
- * effect and return TDB_SUCCESS.  You can detect this condition by
- * looking for TDB_CANT_CHECK in tdb_get_flags().
- *
- * Returns TDB_SUCCESS or an error.
- */
-#define tdb_check(tdb, check, data)					\
-	tdb_check_((tdb), typesafe_cb_preargs(enum TDB_ERROR, void *,	\
-					      (check), (data),		\
-					      struct tdb_data,		\
-					      struct tdb_data),		\
-		   (data))
-
-enum TDB_ERROR tdb_check_(struct tdb_context *tdb,
-			  enum TDB_ERROR (*check)(struct tdb_data k,
-						  struct tdb_data d,
-						  void *data),
-			  void *data);
-
-/**
- * tdb_error - get the last error (not threadsafe)
- * @tdb: the tdb context returned from tdb_open()
- *
- * Returns the last error returned by a TDB function.
- *
- * This makes porting from TDB1 easier, but note that the last error is not
- * reliable in threaded programs.
- */
-enum TDB_ERROR tdb_error(struct tdb_context *tdb);
-
-/**
- * enum tdb_summary_flags - flags for tdb_summary.
- */
-enum tdb_summary_flags {
-	TDB_SUMMARY_HISTOGRAMS = 1 /* Draw graphs in the summary. */
-};
-
-/**
- * tdb_summary - return a string describing the TDB state
- * @tdb: the tdb context returned from tdb_open()
- * @flags: flags to control the summary output.
- * @summary: pointer to string to allocate.
- *
- * This returns a developer-readable string describing the overall
- * state of the tdb, such as the percentage used and sizes of records.
- * It is designed to provide information about the tdb at a glance
- * without displaying any keys or data in the database.
- *
- * On success, sets @summary to point to a malloc()'ed nul-terminated
- * multi-line string.  It is your responsibility to free() it.
- */
-enum TDB_ERROR tdb_summary(struct tdb_context *tdb,
-			   enum tdb_summary_flags flags,
-			   char **summary);
-
-
-/**
- * tdb_get_flags - return the flags for a tdb
- * @tdb: the tdb context returned from tdb_open()
- *
- * This returns the flags on the current tdb.  Some of these are caused by
- * the flags argument to tdb_open(), others (such as TDB_CONVERT) are
- * intuited.
- */
-unsigned int tdb_get_flags(struct tdb_context *tdb);
-
-/**
- * tdb_add_flag - set a flag for a tdb
- * @tdb: the tdb context returned from tdb_open()
- * @flag: one of TDB_NOLOCK, TDB_NOMMAP, TDB_NOSYNC or TDB_ALLOW_NESTING.
- *
- * You can use this to set a flag on the TDB.  You cannot set these flags
- * on a TDB_INTERNAL tdb.
- */
-void tdb_add_flag(struct tdb_context *tdb, unsigned flag);
-
-/**
- * tdb_remove_flag - unset a flag for a tdb
- * @tdb: the tdb context returned from tdb_open()
- * @flag: one of TDB_NOLOCK, TDB_NOMMAP, TDB_NOSYNC or TDB_ALLOW_NESTING.
- *
- * You can use this to clear a flag on the TDB.  You cannot clear flags
- * on a TDB_INTERNAL tdb.
- */
-void tdb_remove_flag(struct tdb_context *tdb, unsigned flag);
-
-/**
- * enum tdb_attribute_type - descriminator for union tdb_attribute.
- */
-enum tdb_attribute_type {
-	TDB_ATTRIBUTE_LOG = 0,
-	TDB_ATTRIBUTE_HASH = 1,
-	TDB_ATTRIBUTE_SEED = 2,
-	TDB_ATTRIBUTE_STATS = 3,
-	TDB_ATTRIBUTE_OPENHOOK = 4,
-	TDB_ATTRIBUTE_FLOCK = 5,
-};
-
-/**
- * tdb_get_attribute - get an attribute for an existing tdb
- * @tdb: the tdb context returned from tdb_open()
- * @attr: the union tdb_attribute to set.
- *
- * This gets an attribute from a TDB which has previously been set (or
- * may return the default values).  Set @attr.base.attr to the
- * attribute type you want get.
- */
-enum TDB_ERROR tdb_get_attribute(struct tdb_context *tdb,
-				 union tdb_attribute *attr);
-
-/**
- * tdb_set_attribute - set an attribute for an existing tdb
- * @tdb: the tdb context returned from tdb_open()
- * @attr: the union tdb_attribute to set.
- *
- * This sets an attribute on a TDB, overriding any previous attribute
- * of the same type.  It returns TDB_ERR_EINVAL if the attribute is
- * unknown or invalid.
- *
- * Note that TDB_ATTRIBUTE_HASH, TDB_ATTRIBUTE_SEED, and
- * TDB_ATTRIBUTE_OPENHOOK cannot currently be set after tdb_open.
- */
-enum TDB_ERROR tdb_set_attribute(struct tdb_context *tdb,
-				 const union tdb_attribute *attr);
-
-/**
- * tdb_unset_attribute - reset an attribute for an existing tdb
- * @tdb: the tdb context returned from tdb_open()
- * @type: the attribute type to unset.
- *
- * This unsets an attribute on a TDB, returning it to the defaults
- * (where applicable).
- *
- * Note that it only makes sense for TDB_ATTRIBUTE_LOG and TDB_ATTRIBUTE_FLOCK
- * to be unset.
- */
-void tdb_unset_attribute(struct tdb_context *tdb,
-			 enum tdb_attribute_type type);
-
-/**
- * tdb_name - get the name of a tdb
- * @tdb: the tdb context returned from tdb_open()
- *
- * This returns a copy of the name string, made at tdb_open() time.  If that
- * argument was NULL (possible for a TDB_INTERNAL db) this will return NULL.
- *
- * This is mostly useful for logging.
- */
-const char *tdb_name(const struct tdb_context *tdb);
-
-/**
- * tdb_fd - get the file descriptor of a tdb
- * @tdb: the tdb context returned from tdb_open()
- *
- * This returns the file descriptor for the underlying database file, or -1
- * for TDB_INTERNAL.
- */
-int tdb_fd(const struct tdb_context *tdb);
-
-/**
- * tdb_foreach - iterate through every open TDB.
- * @fn: the function to call for every TDB
- * @p: the pointer to hand to @fn
- *
- * TDB internally keeps track of all open TDBs; this function allows you to
- * iterate through them.  If @fn returns non-zero, traversal stops.
- */
-#define tdb_foreach(fn, p)						\
-	tdb_foreach_(typesafe_cb_preargs(int, void *, (fn), (p),	\
-					 struct tdb_context *), (p))
-
-void tdb_foreach_(int (*fn)(struct tdb_context *, void *), void *p);
-
-/**
- * struct tdb_attribute_base - common fields for all tdb attributes.
- */
-struct tdb_attribute_base {
-	enum tdb_attribute_type attr;
-	union tdb_attribute *next;
-};
-
-/**
- * enum tdb_log_level - log levels for tdb_attribute_log
- * @TDB_LOG_ERROR: used to log unrecoverable errors such as I/O errors
- *		   or internal consistency failures.
- * @TDB_LOG_USE_ERROR: used to log usage errors such as invalid parameters
- *		   or writing to a read-only database.
- * @TDB_LOG_WARNING: used for informational messages on issues which
- *		     are unusual but handled by TDB internally, such
- *		     as a failure to mmap or failure to open /dev/urandom.
- */
-enum tdb_log_level {
-	TDB_LOG_ERROR,
-	TDB_LOG_USE_ERROR,
-	TDB_LOG_WARNING
-};
-
-/**
- * struct tdb_attribute_log - log function attribute
- *
- * This attribute provides a hook for you to log errors.
- */
-struct tdb_attribute_log {
-	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
-	void (*fn)(struct tdb_context *tdb,
-		   enum tdb_log_level level,
-		   enum TDB_ERROR ecode,
-		   const char *message,
-		   void *data);
-	void *data;
-};
-
-/**
- * struct tdb_attribute_hash - hash function attribute
- *
- * This attribute allows you to provide an alternative hash function.
- * This hash function will be handed keys from the database; it will also
- * be handed the 8-byte TDB_HASH_MAGIC value for checking the header (the
- * tdb_open() will fail if the hash value doesn't match the header).
- *
- * Note that if your hash function gives different results on
- * different machine endians, your tdb will no longer work across
- * different architectures!
- */
-struct tdb_attribute_hash {
-	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
-	uint64_t (*fn)(const void *key, size_t len, uint64_t seed,
-		       void *data);
-	void *data;
-};
-
-/**
- * struct tdb_attribute_seed - hash function seed attribute
- *
- * The hash function seed is normally taken from /dev/urandom (or equivalent)
- * but can be set manually here.  This is mainly for testing purposes.
- */
-struct tdb_attribute_seed {
-	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_SEED */
-	uint64_t seed;
-};
-
-/**
- * struct tdb_attribute_stats - tdb operational statistics
- *
- * This attribute records statistics of various low-level TDB operations.
- * This can be used to assist performance evaluation.  This is only
- * useful for tdb_get_attribute().
- *
- * New fields will be added at the end, hence the "size" argument which
- * indicates how large your structure is: it must be filled in before
- * calling tdb_get_attribute(), which will overwrite it with the size
- * tdb knows about.
- */
-struct tdb_attribute_stats {
-	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_STATS */
-	size_t size; /* = sizeof(struct tdb_attribute_stats) */
-	uint64_t allocs;
-	uint64_t   alloc_subhash;
-	uint64_t   alloc_chain;
-	uint64_t   alloc_bucket_exact;
-	uint64_t   alloc_bucket_max;
-	uint64_t   alloc_leftover;
-	uint64_t   alloc_coalesce_tried;
-	uint64_t     alloc_coalesce_iterate_clash;
-	uint64_t     alloc_coalesce_lockfail;
-	uint64_t     alloc_coalesce_race;
-	uint64_t     alloc_coalesce_succeeded;
-	uint64_t       alloc_coalesce_num_merged;
-	uint64_t compares;
-	uint64_t   compare_wrong_bucket;
-	uint64_t   compare_wrong_offsetbits;
-	uint64_t   compare_wrong_keylen;
-	uint64_t   compare_wrong_rechash;
-	uint64_t   compare_wrong_keycmp;
-	uint64_t transactions;
-	uint64_t   transaction_cancel;
-	uint64_t   transaction_nest;
-	uint64_t   transaction_expand_file;
-	uint64_t   transaction_read_direct;
-	uint64_t      transaction_read_direct_fail;
-	uint64_t   transaction_write_direct;
-	uint64_t      transaction_write_direct_fail;
-	uint64_t expands;
-	uint64_t frees;
-	uint64_t locks;
-	uint64_t   lock_lowlevel;
-	uint64_t   lock_nonblock;
-	uint64_t     lock_nonblock_fail;
-};
-
-/**
- * struct tdb_attribute_openhook - tdb special effects hook for open
- *
- * This attribute contains a function to call once we have the OPEN_LOCK
- * for the tdb, but before we've examined its contents.  If this succeeds,
- * the tdb will be populated if it's then zero-length.
- *
- * This is a hack to allow support for TDB1-style TDB_CLEAR_IF_FIRST
- * behaviour.
- */
-struct tdb_attribute_openhook {
-	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_OPENHOOK */
-	enum TDB_ERROR (*fn)(int fd, void *data);
-	void *data;
-};
-
-/**
- * struct tdb_attribute_flock - tdb special effects hook for file locking
- *
- * This attribute contains function to call to place locks on a file; it can
- * be used to support non-blocking operations or lock proxying.
- *
- * They should return 0 on success, -1 on failure and set errno.
- *
- * An error will be logged on error if errno is neither EAGAIN nor EINTR
- * (normally it would only return EAGAIN if waitflag is false, and
- * loop internally on EINTR).
- */
-struct tdb_attribute_flock {
-	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_FLOCK */
-	int (*lock)(int fd,int rw, off_t off, off_t len, bool waitflag, void *);
-	int (*unlock)(int fd, int rw, off_t off, off_t len, void *);
-	void *data;
-};
-
-/**
- * union tdb_attribute - tdb attributes.
- *
- * This represents all the known attributes.
- *
- * See also:
- *	struct tdb_attribute_log, struct tdb_attribute_hash,
- *	struct tdb_attribute_seed, struct tdb_attribute_stats,
- *	struct tdb_attribute_openhook, struct tdb_attribute_flock.
- */
-union tdb_attribute {
-	struct tdb_attribute_base base;
-	struct tdb_attribute_log log;
-	struct tdb_attribute_hash hash;
-	struct tdb_attribute_seed seed;
-	struct tdb_attribute_stats stats;
-	struct tdb_attribute_openhook openhook;
-	struct tdb_attribute_flock flock;
-};
-
-#ifdef  __cplusplus
-}
-#endif
-
-#endif /* tdb2.h */
diff --git a/lib/tdb2/test/api-12-store.c b/lib/tdb2/test/api-12-store.c
deleted file mode 100644
index 6a9dd95f5f..0000000000
--- a/lib/tdb2/test/api-12-store.c
+++ /dev/null
@@ -1,57 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <ccan/hash/hash.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-
-#include "logging.h"
-
-/* We use the same seed which we saw a failure on. */
-static uint64_t fixedhash(const void *key, size_t len, uint64_t seed, void *p)
-{
-	return hash64_stable((const unsigned char *)key, len,
-			     *(uint64_t *)p);
-}
-
-int main(int argc, char *argv[])
-{
-	unsigned int i, j;
-	struct tdb_context *tdb;
-	uint64_t seed = 16014841315512641303ULL;
-	union tdb_attribute fixed_hattr
-		= { .hash = { .base = { TDB_ATTRIBUTE_HASH },
-			      .fn = fixedhash,
-			      .data = &seed } };
-	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
-			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
-			TDB_NOMMAP|TDB_CONVERT };
-	struct tdb_data key = { (unsigned char *)&j, sizeof(j) };
-	struct tdb_data data = { (unsigned char *)&j, sizeof(j) };
-
-	fixed_hattr.base.next = &tap_log_attr;
-
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * (1 + 500 * 3) + 1);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		tdb = tdb_open("run-12-store.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &fixed_hattr);
-		ok1(tdb);
-		if (!tdb)
-			continue;
-
-		/* We seemed to lose some keys.
-		 * Insert and check they're in there! */
-		for (j = 0; j < 500; j++) {
-			struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */
-			ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0);
-			ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
-			ok1(tdb_deq(d, data));
-			free(d.dptr);
-		}
-		tdb_close(tdb);
-	}
-
-	ok1(tap_log_messages == 0);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/api-13-delete.c b/lib/tdb2/test/api-13-delete.c
deleted file mode 100644
index 279b38645b..0000000000
--- a/lib/tdb2/test/api-13-delete.c
+++ /dev/null
@@ -1,205 +0,0 @@
-#include "private.h" // For TDB_TOPLEVEL_HASH_BITS
-#include <ccan/hash/hash.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "tdb2.h"
-#include "tap-interface.h"
-#include "logging.h"
-
-/* We rig the hash so adjacent-numbered records always clash. */
-static uint64_t clash(const void *key, size_t len, uint64_t seed, void *priv)
-{
-	return ((uint64_t)*(const unsigned int *)key)
-		<< (64 - TDB_TOPLEVEL_HASH_BITS - 1);
-}
-
-/* We use the same seed which we saw a failure on. */
-static uint64_t fixedhash(const void *key, size_t len, uint64_t seed, void *p)
-{
-	return hash64_stable((const unsigned char *)key, len,
-			     *(uint64_t *)p);
-}
-
-static bool store_records(struct tdb_context *tdb)
-{
-	int i;
-	struct tdb_data key = { (unsigned char *)&i, sizeof(i) };
-	struct tdb_data d, data = { (unsigned char *)&i, sizeof(i) };
-
-	for (i = 0; i < 1000; i++) {
-		if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
-			return false;
-		tdb_fetch(tdb, key, &d);
-		if (!tdb_deq(d, data))
-			return false;
-		free(d.dptr);
-	}
-	return true;
-}
-
-static void test_val(struct tdb_context *tdb, uint64_t val)
-{
-	uint64_t v;
-	struct tdb_data key = { (unsigned char *)&v, sizeof(v) };
-	struct tdb_data d, data = { (unsigned char *)&v, sizeof(v) };
-
-	/* Insert an entry, then delete it. */
-	v = val;
-	/* Delete should fail. */
-	ok1(tdb_delete(tdb, key) == TDB_ERR_NOEXIST);
-	ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-	/* Insert should succeed. */
-	ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
-	ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-	/* Delete should succeed. */
-	ok1(tdb_delete(tdb, key) == 0);
-	ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-	/* Re-add it, then add collision. */
-	ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
-	v = val + 1;
-	ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
-	ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-	/* Can find both? */
-	ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
-	ok1(d.dsize == data.dsize);
-	free(d.dptr);
-	v = val;
-	ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
-	ok1(d.dsize == data.dsize);
-	free(d.dptr);
-
-	/* Delete second one. */
-	v = val + 1;
-	ok1(tdb_delete(tdb, key) == 0);
-	ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-	/* Re-add */
-	ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
-	ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-	/* Now, try deleting first one. */
-	v = val;
-	ok1(tdb_delete(tdb, key) == 0);
-	ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-	/* Can still find second? */
-	v = val + 1;
-	ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
-	ok1(d.dsize == data.dsize);
-	free(d.dptr);
-
-	/* Now, this will be ideally placed. */
-	v = val + 2;
-	ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
-	ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-	/* This will collide with both. */
-	v = val;
-	ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
-
-	/* We can still find them all, right? */
-	ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
-	ok1(d.dsize == data.dsize);
-	free(d.dptr);
-	v = val + 1;
-	ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
-	ok1(d.dsize == data.dsize);
-	free(d.dptr);
-	v = val + 2;
-	ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
-	ok1(d.dsize == data.dsize);
-	free(d.dptr);
-
-	/* And if we delete val + 1, that val + 2 should not move! */
-	v = val + 1;
-	ok1(tdb_delete(tdb, key) == 0);
-	ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-	v = val;
-	ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
-	ok1(d.dsize == data.dsize);
-	free(d.dptr);
-	v = val + 2;
-	ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
-	ok1(d.dsize == data.dsize);
-	free(d.dptr);
-
-	/* Delete those two, so we are empty. */
-	ok1(tdb_delete(tdb, key) == 0);
-	v = val;
-	ok1(tdb_delete(tdb, key) == 0);
-
-	ok1(tdb_check(tdb, NULL, NULL) == 0);
-}
-
-int main(int argc, char *argv[])
-{
-	unsigned int i, j;
-	struct tdb_context *tdb;
-	uint64_t seed = 16014841315512641303ULL;
-	union tdb_attribute clash_hattr
-		= { .hash = { .base = { TDB_ATTRIBUTE_HASH },
-			      .fn = clash } };
-	union tdb_attribute fixed_hattr
-		= { .hash = { .base = { TDB_ATTRIBUTE_HASH },
-			      .fn = fixedhash,
-			      .data = &seed } };
-	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
-			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
-			TDB_NOMMAP|TDB_CONVERT };
-	/* These two values gave trouble before. */
-	int vals[] = { 755, 837 };
-
-	clash_hattr.base.next = &tap_log_attr;
-	fixed_hattr.base.next = &tap_log_attr;
-
-	plan_tests(sizeof(flags) / sizeof(flags[0])
-		   * (39 * 3 + 5 + sizeof(vals)/sizeof(vals[0])*2) + 1);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		tdb = tdb_open("run-13-delete.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &clash_hattr);
-		ok1(tdb);
-		if (!tdb)
-			continue;
-
-		/* Check start of hash table. */
-		test_val(tdb, 0);
-
-		/* Check end of hash table. */
-		test_val(tdb, -1ULL);
-
-		/* Check mixed bitpattern. */
-		test_val(tdb, 0x123456789ABCDEF0ULL);
-
-		ok1(!tdb->file || (tdb->file->allrecord_lock.count == 0
-				   && tdb->file->num_lockrecs == 0));
-		tdb_close(tdb);
-
-		/* Deleting these entries in the db gave problems. */
-		tdb = tdb_open("run-13-delete.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &fixed_hattr);
-		ok1(tdb);
-		if (!tdb)
-			continue;
-
-		ok1(store_records(tdb));
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-		for (j = 0; j < sizeof(vals)/sizeof(vals[0]); j++) {
-			struct tdb_data key;
-
-			key.dptr = (unsigned char *)&vals[j];
-			key.dsize = sizeof(vals[j]);
-			ok1(tdb_delete(tdb, key) == 0);
-			ok1(tdb_check(tdb, NULL, NULL) == 0);
-		}
-		tdb_close(tdb);
-	}
-
-	ok1(tap_log_messages == 0);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/api-14-exists.c b/lib/tdb2/test/api-14-exists.c
deleted file mode 100644
index 801c295893..0000000000
--- a/lib/tdb2/test/api-14-exists.c
+++ /dev/null
@@ -1,54 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "logging.h"
-
-static bool test_records(struct tdb_context *tdb)
-{
-	int i;
-	struct tdb_data key = { (unsigned char *)&i, sizeof(i) };
-	struct tdb_data data = { (unsigned char *)&i, sizeof(i) };
-
-	for (i = 0; i < 1000; i++) {
-		if (tdb_exists(tdb, key))
-			return false;
-		if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
-			return false;
-		if (!tdb_exists(tdb, key))
-			return false;
-	}
-
-	for (i = 0; i < 1000; i++) {
-		if (!tdb_exists(tdb, key))
-			return false;
-		if (tdb_delete(tdb, key) != 0)
-			return false;
-		if (tdb_exists(tdb, key))
-			return false;
-	}
-	return true;
-}
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	struct tdb_context *tdb;
-	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
-			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
-			TDB_NOMMAP|TDB_CONVERT };
-
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 2 + 1);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		tdb = tdb_open("run-14-exists.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		if (ok1(tdb))
-			ok1(test_records(tdb));
-		tdb_close(tdb);
-	}
-
-	ok1(tap_log_messages == 0);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/api-16-wipe_all.c b/lib/tdb2/test/api-16-wipe_all.c
deleted file mode 100644
index 3dfcc7a419..0000000000
--- a/lib/tdb2/test/api-16-wipe_all.c
+++ /dev/null
@@ -1,46 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "logging.h"
-
-static bool add_records(struct tdb_context *tdb)
-{
-	int i;
-	struct tdb_data key = { (unsigned char *)&i, sizeof(i) };
-	struct tdb_data data = { (unsigned char *)&i, sizeof(i) };
-
-	for (i = 0; i < 1000; i++) {
-		if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
-			return false;
-	}
-	return true;
-}
-
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	struct tdb_context *tdb;
-	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
-			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
-			TDB_NOMMAP|TDB_CONVERT };
-
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 4 + 1);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		tdb = tdb_open("run-16-wipe_all.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		if (ok1(tdb)) {
-			struct tdb_data key;
-			ok1(add_records(tdb));
-			ok1(tdb_wipe_all(tdb) == TDB_SUCCESS);
-			ok1(tdb_firstkey(tdb, &key) == TDB_ERR_NOEXIST);
-			tdb_close(tdb);
-		}
-	}
-
-	ok1(tap_log_messages == 0);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/api-21-parse_record.c b/lib/tdb2/test/api-21-parse_record.c
deleted file mode 100644
index 150e1c9dd0..0000000000
--- a/lib/tdb2/test/api-21-parse_record.c
+++ /dev/null
@@ -1,67 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "logging.h"
-
-static enum TDB_ERROR parse(TDB_DATA key, TDB_DATA data, TDB_DATA *expected)
-{
-	if (!tdb_deq(data, *expected))
-		return TDB_ERR_EINVAL;
-	return TDB_SUCCESS;
-}
-
-static enum TDB_ERROR parse_err(TDB_DATA key, TDB_DATA data, void *unused)
-{
-	return 100;
-}
-
-static bool test_records(struct tdb_context *tdb)
-{
-	int i;
-	struct tdb_data key = { (unsigned char *)&i, sizeof(i) };
-	struct tdb_data data = { (unsigned char *)&i, sizeof(i) };
-
-	for (i = 0; i < 1000; i++) {
-		if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
-			return false;
-	}
-
-	for (i = 0; i < 1000; i++) {
-		if (tdb_parse_record(tdb, key, parse, &data) != TDB_SUCCESS)
-			return false;
-	}
-
-	if (tdb_parse_record(tdb, key, parse, &data) != TDB_ERR_NOEXIST)
-		return false;
-
-	/* Test error return from parse function. */
-	i = 0;
-	if (tdb_parse_record(tdb, key, parse_err, NULL) != 100)
-		return false;
-
-	return true;
-}
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	struct tdb_context *tdb;
-	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
-			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
-			TDB_NOMMAP|TDB_CONVERT };
-
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 2 + 1);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		tdb = tdb_open("api-21-parse_record.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		if (ok1(tdb))
-			ok1(test_records(tdb));
-		tdb_close(tdb);
-	}
-
-	ok1(tap_log_messages == 0);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/api-55-transaction.c b/lib/tdb2/test/api-55-transaction.c
deleted file mode 100644
index c474c6abc3..0000000000
--- a/lib/tdb2/test/api-55-transaction.c
+++ /dev/null
@@ -1,73 +0,0 @@
-#include "private.h" // struct tdb_context
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include "logging.h"
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	struct tdb_context *tdb;
-	unsigned char *buffer;
-	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
-			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
-	struct tdb_data key = tdb_mkdata("key", 3);
-	struct tdb_data data;
-
-	buffer = malloc(1000);
-	for (i = 0; i < 1000; i++)
-		buffer[i] = i;
-
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 20 + 1);
-
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		tdb = tdb_open("run-55-transaction.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		ok1(tdb);
-		if (!tdb)
-			continue;
-
-		ok1(tdb_transaction_start(tdb) == 0);
-		data.dptr = buffer;
-		data.dsize = 1000;
-		ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
-		ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS);
-		ok1(data.dsize == 1000);
-		ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
-		free(data.dptr);
-
-		/* Cancelling a transaction means no store */
-		tdb_transaction_cancel(tdb);
-		ok1(tdb->file->allrecord_lock.count == 0
-		    && tdb->file->num_lockrecs == 0);
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-		ok1(tdb_fetch(tdb, key, &data) == TDB_ERR_NOEXIST);
-
-		/* Commit the transaction. */
-		ok1(tdb_transaction_start(tdb) == 0);
-		data.dptr = buffer;
-		data.dsize = 1000;
-		ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
-		ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS);
-		ok1(data.dsize == 1000);
-		ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
-		free(data.dptr);
-		ok1(tdb_transaction_commit(tdb) == 0);
-		ok1(tdb->file->allrecord_lock.count == 0
-		    && tdb->file->num_lockrecs == 0);
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-		ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS);
-		ok1(data.dsize == 1000);
-		ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
-		free(data.dptr);
-
-		tdb_close(tdb);
-	}
-
-	ok1(tap_log_messages == 0);
-	free(buffer);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/api-80-tdb_fd.c b/lib/tdb2/test/api-80-tdb_fd.c
deleted file mode 100644
index 63967b8aa6..0000000000
--- a/lib/tdb2/test/api-80-tdb_fd.c
+++ /dev/null
@@ -1,32 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "logging.h"
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	struct tdb_context *tdb;
-	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
-			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
-			TDB_NOMMAP|TDB_CONVERT };
-
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 3);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		tdb = tdb_open("api-80-tdb_fd.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		if (!ok1(tdb))
-			continue;
-
-		if (flags[i] & TDB_INTERNAL)
-			ok1(tdb_fd(tdb) == -1);
-		else
-			ok1(tdb_fd(tdb) > 2);
-		tdb_close(tdb);
-		ok1(tap_log_messages == 0);
-	}
-	return exit_status();
-}
diff --git a/lib/tdb2/test/api-81-seqnum.c b/lib/tdb2/test/api-81-seqnum.c
deleted file mode 100644
index 8bf261d635..0000000000
--- a/lib/tdb2/test/api-81-seqnum.c
+++ /dev/null
@@ -1,69 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include "logging.h"
-
-int main(int argc, char *argv[])
-{
-	unsigned int i, seq;
-	struct tdb_context *tdb;
-	struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */
-	struct tdb_data key = tdb_mkdata("key", 3);
-	struct tdb_data data = tdb_mkdata("data", 4);
-	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
-			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
-			TDB_NOMMAP|TDB_CONVERT };
-
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 15 + 4 * 13);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		tdb = tdb_open("api-81-seqnum.tdb", flags[i]|TDB_SEQNUM,
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		if (!ok1(tdb))
-			continue;
-
-		seq = 0;
-		ok1(tdb_get_seqnum(tdb) == seq);
-		ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
-		ok1(tdb_get_seqnum(tdb) == ++seq);
-		/* Fetch doesn't change seqnum */
-		if (ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS))
-			free(d.dptr);
-		ok1(tdb_get_seqnum(tdb) == seq);
-		ok1(tdb_append(tdb, key, data) == TDB_SUCCESS);
-		ok1(tdb_get_seqnum(tdb) == ++seq);
-
-		ok1(tdb_delete(tdb, key) == TDB_SUCCESS);
-		ok1(tdb_get_seqnum(tdb) == ++seq);
-		/* Empty append works */
-		ok1(tdb_append(tdb, key, data) == TDB_SUCCESS);
-		ok1(tdb_get_seqnum(tdb) == ++seq);
-
-		ok1(tdb_wipe_all(tdb) == TDB_SUCCESS);
-		ok1(tdb_get_seqnum(tdb) == ++seq);
-
-		if (!(flags[i] & TDB_INTERNAL)) {
-			ok1(tdb_transaction_start(tdb) == TDB_SUCCESS);
-			ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
-			ok1(tdb_get_seqnum(tdb) == ++seq);
-			ok1(tdb_append(tdb, key, data) == TDB_SUCCESS);
-			ok1(tdb_get_seqnum(tdb) == ++seq);
-			ok1(tdb_delete(tdb, key) == TDB_SUCCESS);
-			ok1(tdb_get_seqnum(tdb) == ++seq);
-			ok1(tdb_transaction_commit(tdb) == TDB_SUCCESS);
-			ok1(tdb_get_seqnum(tdb) == seq);
-
-			ok1(tdb_transaction_start(tdb) == TDB_SUCCESS);
-			ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
-			ok1(tdb_get_seqnum(tdb) == seq + 1);
-			tdb_transaction_cancel(tdb);
-			ok1(tdb_get_seqnum(tdb) == seq);
-		}
-		tdb_close(tdb);
-		ok1(tap_log_messages == 0);
-	}
-	return exit_status();
-}
diff --git a/lib/tdb2/test/api-82-lockattr.c b/lib/tdb2/test/api-82-lockattr.c
deleted file mode 100644
index b229eab83c..0000000000
--- a/lib/tdb2/test/api-82-lockattr.c
+++ /dev/null
@@ -1,237 +0,0 @@
-#include "private.h" // for tdb_fcntl_unlock
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <errno.h>
-#include "logging.h"
-
-static int mylock(int fd, int rw, off_t off, off_t len, bool waitflag,
-		  void *_err)
-{
-	int *lock_err = _err;
-	struct flock fl;
-	int ret;
-
-	if (*lock_err) {
-		errno = *lock_err;
-		return -1;
-	}
-
-	do {
-		fl.l_type = rw;
-		fl.l_whence = SEEK_SET;
-		fl.l_start = off;
-		fl.l_len = len;
-
-		if (waitflag)
-			ret = fcntl(fd, F_SETLKW, &fl);
-		else
-			ret = fcntl(fd, F_SETLK, &fl);
-	} while (ret != 0 && errno == EINTR);
-
-	return ret;
-}
-
-static int trav_err;
-static int trav(struct tdb_context *tdb, TDB_DATA k, TDB_DATA d, int *terr)
-{
-	*terr = trav_err;
-	return 0;
-}
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	struct tdb_context *tdb;
-	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
-			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
-	union tdb_attribute lock_attr;
-	struct tdb_data key = tdb_mkdata("key", 3);
-	struct tdb_data data = tdb_mkdata("data", 4);
-	int lock_err;
-
-	lock_attr.base.attr = TDB_ATTRIBUTE_FLOCK;
-	lock_attr.base.next = &tap_log_attr;
-	lock_attr.flock.lock = mylock;
-	lock_attr.flock.unlock = tdb_fcntl_unlock;
-	lock_attr.flock.data = &lock_err;
-
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 80);
-
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		struct tdb_data d;
-
-		/* Nonblocking open; expect no error message. */
-		lock_err = EAGAIN;
-		tdb = tdb_open("run-82-lockattr.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr);
-		ok(errno == lock_err, "Errno is %u", errno);
-		ok1(!tdb);
-		ok1(tap_log_messages == 0);
-
-		lock_err = EINTR;
-		tdb = tdb_open("run-82-lockattr.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr);
-		ok(errno == lock_err, "Errno is %u", errno);
-		ok1(!tdb);
-		ok1(tap_log_messages == 0);
-
-		/* Forced fail open. */
-		lock_err = ENOMEM;
-		tdb = tdb_open("run-82-lockattr.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr);
-		ok1(errno == lock_err);
-		ok1(!tdb);
-		ok1(tap_log_messages == 1);
-		tap_log_messages = 0;
-
-		lock_err = 0;
-		tdb = tdb_open("run-82-lockattr.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr);
-		if (!ok1(tdb))
-			continue;
-		ok1(tap_log_messages == 0);
-
-		/* Nonblocking store. */
-		lock_err = EAGAIN;
-		ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 0);
-		lock_err = EINTR;
-		ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 0);
-		lock_err = ENOMEM;
-		ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 1);
-		tap_log_messages = 0;
-
-		/* Nonblocking fetch. */
-		lock_err = EAGAIN;
-		ok1(!tdb_exists(tdb, key));
-		ok1(tap_log_messages == 0);
-		lock_err = EINTR;
-		ok1(!tdb_exists(tdb, key));
-		ok1(tap_log_messages == 0);
-		lock_err = ENOMEM;
-		ok1(!tdb_exists(tdb, key));
-		ok1(tap_log_messages == 1);
-		tap_log_messages = 0;
-
-		lock_err = EAGAIN;
-		ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 0);
-		lock_err = EINTR;
-		ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 0);
-		lock_err = ENOMEM;
-		ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 1);
-		tap_log_messages = 0;
-
-		/* Nonblocking delete. */
-		lock_err = EAGAIN;
-		ok1(tdb_delete(tdb, key) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 0);
-		lock_err = EINTR;
-		ok1(tdb_delete(tdb, key) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 0);
-		lock_err = ENOMEM;
-		ok1(tdb_delete(tdb, key) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 1);
-		tap_log_messages = 0;
-
-		/* Nonblocking locks. */
-		lock_err = EAGAIN;
-		ok1(tdb_chainlock(tdb, key) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 0);
-		lock_err = EINTR;
-		ok1(tdb_chainlock(tdb, key) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 0);
-		lock_err = ENOMEM;
-		ok1(tdb_chainlock(tdb, key) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 1);
-		tap_log_messages = 0;
-
-		lock_err = EAGAIN;
-		ok1(tdb_chainlock_read(tdb, key) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 0);
-		lock_err = EINTR;
-		ok1(tdb_chainlock_read(tdb, key) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 0);
-		lock_err = ENOMEM;
-		ok1(tdb_chainlock_read(tdb, key) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 1);
-		tap_log_messages = 0;
-
-		lock_err = EAGAIN;
-		ok1(tdb_lockall(tdb) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 0);
-		lock_err = EINTR;
-		ok1(tdb_lockall(tdb) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 0);
-		lock_err = ENOMEM;
-		ok1(tdb_lockall(tdb) == TDB_ERR_LOCK);
-		/* This actually does divide and conquer. */
-		ok1(tap_log_messages > 0);
-		tap_log_messages = 0;
-
-		lock_err = EAGAIN;
-		ok1(tdb_lockall_read(tdb) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 0);
-		lock_err = EINTR;
-		ok1(tdb_lockall_read(tdb) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 0);
-		lock_err = ENOMEM;
-		ok1(tdb_lockall_read(tdb) == TDB_ERR_LOCK);
-		ok1(tap_log_messages > 0);
-		tap_log_messages = 0;
-
-		/* Nonblocking traverse; go nonblock partway through. */
-		lock_err = 0;
-		ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0);
-		trav_err = EAGAIN;
-		ok1(tdb_traverse(tdb, trav, &lock_err) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 0);
-		trav_err = EINTR;
-		lock_err = 0;
-		ok1(tdb_traverse(tdb, trav, &lock_err) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 0);
-		trav_err = ENOMEM;
-		lock_err = 0;
-		ok1(tdb_traverse(tdb, trav, &lock_err) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 1);
-		tap_log_messages = 0;
-
-		/* Nonblocking transactions. */
-		lock_err = EAGAIN;
-		ok1(tdb_transaction_start(tdb) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 0);
-		lock_err = EINTR;
-		ok1(tdb_transaction_start(tdb) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 0);
-		lock_err = ENOMEM;
-		ok1(tdb_transaction_start(tdb) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 1);
-		tap_log_messages = 0;
-
-		/* Nonblocking transaction prepare. */
-		lock_err = 0;
-		ok1(tdb_transaction_start(tdb) == 0);
-		ok1(tdb_delete(tdb, key) == 0);
-
-		lock_err = EAGAIN;
-		ok1(tdb_transaction_prepare_commit(tdb) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 0);
-
-		lock_err = 0;
-		ok1(tdb_transaction_prepare_commit(tdb) == 0);
-		ok1(tdb_transaction_commit(tdb) == 0);
-
-		/* And the transaction was committed, right? */
-		ok1(!tdb_exists(tdb, key));
-		tdb_close(tdb);
-		ok1(tap_log_messages == 0);
-	}
-	return exit_status();
-}
diff --git a/lib/tdb2/test/api-83-openhook.c b/lib/tdb2/test/api-83-openhook.c
deleted file mode 100644
index 191cf068c1..0000000000
--- a/lib/tdb2/test/api-83-openhook.c
+++ /dev/null
@@ -1,96 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <stdarg.h>
-#include <unistd.h>
-#include "external-agent.h"
-#include "logging.h"
-
-static enum TDB_ERROR clear_if_first(int fd, void *arg)
-{
-/* We hold a lock offset 4 always, so we can tell if anyone is holding it.
- * (This is compatible with tdb1's TDB_CLEAR_IF_FIRST flag).  */
-	struct flock fl;
-
-	if (arg != clear_if_first)
-		return TDB_ERR_CORRUPT;
-
-	fl.l_type = F_WRLCK;
-	fl.l_whence = SEEK_SET;
-	fl.l_start = 4;
-	fl.l_len = 1;
-
-	if (fcntl(fd, F_SETLK, &fl) == 0) {
-		/* We must be first ones to open it! */
-		diag("truncating file!");
-		if (ftruncate(fd, 0) != 0) {
-			return TDB_ERR_IO;
-		}
-	}
-	fl.l_type = F_RDLCK;
-	if (fcntl(fd, F_SETLKW, &fl) != 0) {
-		return TDB_ERR_IO;
-	}
-	return TDB_SUCCESS;
-}
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	struct tdb_context *tdb;
-	struct agent *agent;
-	union tdb_attribute cif;
-	struct tdb_data key = tdb_mkdata("key", 3);
-	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
-			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
-
-	cif.openhook.base.attr = TDB_ATTRIBUTE_OPENHOOK;
-	cif.openhook.base.next = &tap_log_attr;
-	cif.openhook.fn = clear_if_first;
-	cif.openhook.data = clear_if_first;
-
-	agent = prepare_external_agent();
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 13);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		/* Create it */
-		tdb = tdb_open("run-83-openhook.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, NULL);
-		ok1(tdb);
-		ok1(tdb_store(tdb, key, key, TDB_REPLACE) == 0);
-		tdb_close(tdb);
-
-		/* Now, open with CIF, should clear it. */
-		tdb = tdb_open("run-83-openhook.tdb", flags[i],
-			       O_RDWR, 0, &cif);
-		ok1(tdb);
-		ok1(!tdb_exists(tdb, key));
-		ok1(tdb_store(tdb, key, key, TDB_REPLACE) == 0);
-
-		/* Agent should not clear it, since it's still open. */
-		ok1(external_agent_operation(agent, OPEN_WITH_HOOK,
-					     "run-83-openhook.tdb") == SUCCESS);
-		ok1(external_agent_operation(agent, FETCH, "key") == SUCCESS);
-		ok1(external_agent_operation(agent, CLOSE, "") == SUCCESS);
-
-		/* Still exists for us too. */
-		ok1(tdb_exists(tdb, key));
-
-		/* Close it, now agent should clear it. */
-		tdb_close(tdb);
-
-		ok1(external_agent_operation(agent, OPEN_WITH_HOOK,
-					     "run-83-openhook.tdb") == SUCCESS);
-		ok1(external_agent_operation(agent, FETCH, "key") == FAILED);
-		ok1(external_agent_operation(agent, CLOSE, "") == SUCCESS);
-
-		ok1(tap_log_messages == 0);
-	}
-
-	free_external_agent(agent);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/api-91-get-stats.c b/lib/tdb2/test/api-91-get-stats.c
deleted file mode 100644
index 395db3fb18..0000000000
--- a/lib/tdb2/test/api-91-get-stats.c
+++ /dev/null
@@ -1,57 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include <stddef.h>
-#include "logging.h"
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	struct tdb_context *tdb;
-	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
-			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
-
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 11);
-
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		union tdb_attribute *attr;
-		struct tdb_data key = tdb_mkdata("key", 3);
-
-		tdb = tdb_open("run-91-get-stats.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		ok1(tdb);
-		ok1(tdb_store(tdb, key, key, TDB_REPLACE) == 0);
-
-		/* Use malloc so valgrind will catch overruns. */
-		attr = malloc(sizeof *attr);
-		attr->stats.base.attr = TDB_ATTRIBUTE_STATS;
-		attr->stats.size = sizeof(*attr);
-
-		ok1(tdb_get_attribute(tdb, attr) == 0);
-		ok1(attr->stats.size == sizeof(*attr));
-		ok1(attr->stats.allocs > 0);
-		ok1(attr->stats.expands > 0);
-		ok1(attr->stats.locks > 0);
-		free(attr);
-
-		/* Try short one. */
-		attr = malloc(offsetof(struct tdb_attribute_stats, allocs)
-			      + sizeof(attr->stats.allocs));
-		attr->stats.base.attr = TDB_ATTRIBUTE_STATS;
-		attr->stats.size = offsetof(struct tdb_attribute_stats, allocs)
-			+ sizeof(attr->stats.allocs);
-		ok1(tdb_get_attribute(tdb, attr) == 0);
-		ok1(attr->stats.size == sizeof(*attr));
-		ok1(attr->stats.allocs > 0);
-		free(attr);
-		ok1(tap_log_messages == 0);
-
-		tdb_close(tdb);
-
-	}
-	return exit_status();
-}
diff --git a/lib/tdb2/test/api-92-get-set-readonly.c b/lib/tdb2/test/api-92-get-set-readonly.c
deleted file mode 100644
index 46aea7ae0d..0000000000
--- a/lib/tdb2/test/api-92-get-set-readonly.c
+++ /dev/null
@@ -1,105 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "logging.h"
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	struct tdb_context *tdb;
-	struct tdb_data key = tdb_mkdata("key", 3);
-	struct tdb_data data = tdb_mkdata("data", 4);
-	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
-			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
-
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 48);
-
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		/* RW -> R0 */
-		tdb = tdb_open("run-92-get-set-readonly.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		ok1(tdb);
-		ok1(!(tdb_get_flags(tdb) & TDB_RDONLY));
-
-		ok1(tdb_store(tdb, key, data, TDB_INSERT) == TDB_SUCCESS);
-
-		tdb_add_flag(tdb, TDB_RDONLY);
-		ok1(tdb_get_flags(tdb) & TDB_RDONLY);
-
-		/* Can't store, append, delete. */
-		ok1(tdb_store(tdb, key, data, TDB_MODIFY) == TDB_ERR_RDONLY);
-		ok1(tap_log_messages == 1);
-		ok1(tdb_append(tdb, key, data) == TDB_ERR_RDONLY);
-		ok1(tap_log_messages == 2);
-		ok1(tdb_delete(tdb, key) == TDB_ERR_RDONLY);
-		ok1(tap_log_messages == 3);
-
-		/* Can't start a transaction, or any write lock. */
-		ok1(tdb_transaction_start(tdb) == TDB_ERR_RDONLY);
-		ok1(tap_log_messages == 4);
-		ok1(tdb_chainlock(tdb, key) == TDB_ERR_RDONLY);
-		ok1(tap_log_messages == 5);
-		ok1(tdb_lockall(tdb) == TDB_ERR_RDONLY);
-		ok1(tap_log_messages == 6);
-		ok1(tdb_wipe_all(tdb) == TDB_ERR_RDONLY);
-		ok1(tap_log_messages == 7);
-
-		/* Back to RW. */
-		tdb_remove_flag(tdb, TDB_RDONLY);
-		ok1(!(tdb_get_flags(tdb) & TDB_RDONLY));
-
-		ok1(tdb_store(tdb, key, data, TDB_MODIFY) == TDB_SUCCESS);
-		ok1(tdb_append(tdb, key, data) == TDB_SUCCESS);
-		ok1(tdb_delete(tdb, key) == TDB_SUCCESS);
-
-		ok1(tdb_transaction_start(tdb) == TDB_SUCCESS);
-		ok1(tdb_store(tdb, key, data, TDB_INSERT) == TDB_SUCCESS);
-		ok1(tdb_transaction_commit(tdb) == TDB_SUCCESS);
-
-		ok1(tdb_chainlock(tdb, key) == TDB_SUCCESS);
-		tdb_chainunlock(tdb, key);
-		ok1(tdb_lockall(tdb) == TDB_SUCCESS);
-		tdb_unlockall(tdb);
-		ok1(tdb_wipe_all(tdb) == TDB_SUCCESS);
-		ok1(tap_log_messages == 7);
-
-		tdb_close(tdb);
-
-		/* R0 -> RW */
-		tdb = tdb_open("run-92-get-set-readonly.tdb", flags[i],
-			       O_RDONLY, 0600, &tap_log_attr);
-		ok1(tdb);
-		ok1(tdb_get_flags(tdb) & TDB_RDONLY);
-
-		/* Can't store, append, delete. */
-		ok1(tdb_store(tdb, key, data, TDB_INSERT) == TDB_ERR_RDONLY);
-		ok1(tap_log_messages == 8);
-		ok1(tdb_append(tdb, key, data) == TDB_ERR_RDONLY);
-		ok1(tap_log_messages == 9);
-		ok1(tdb_delete(tdb, key) == TDB_ERR_RDONLY);
-		ok1(tap_log_messages == 10);
-
-		/* Can't start a transaction, or any write lock. */
-		ok1(tdb_transaction_start(tdb) == TDB_ERR_RDONLY);
-		ok1(tap_log_messages == 11);
-		ok1(tdb_chainlock(tdb, key) == TDB_ERR_RDONLY);
-		ok1(tap_log_messages == 12);
-		ok1(tdb_lockall(tdb) == TDB_ERR_RDONLY);
-		ok1(tap_log_messages == 13);
-		ok1(tdb_wipe_all(tdb) == TDB_ERR_RDONLY);
-		ok1(tap_log_messages == 14);
-
-		/* Can't remove TDB_RDONLY since we opened with O_RDONLY */
-		tdb_remove_flag(tdb, TDB_RDONLY);
-		ok1(tap_log_messages == 15);
-		ok1(tdb_get_flags(tdb) & TDB_RDONLY);
-		tdb_close(tdb);
-
-		ok1(tap_log_messages == 15);
-		tap_log_messages = 0;
-	}
-	return exit_status();
-}
diff --git a/lib/tdb2/test/api-93-repack.c b/lib/tdb2/test/api-93-repack.c
deleted file mode 100644
index 910eb9b301..0000000000
--- a/lib/tdb2/test/api-93-repack.c
+++ /dev/null
@@ -1,80 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "logging.h"
-
-#define NUM_TESTS 1000
-
-static bool store_all(struct tdb_context *tdb)
-{
-	unsigned int i;
-	struct tdb_data key = { (unsigned char *)&i, sizeof(i) };
-	struct tdb_data dbuf = { (unsigned char *)&i, sizeof(i) };
-
-	for (i = 0; i < NUM_TESTS; i++) {
-		if (tdb_store(tdb, key, dbuf, TDB_INSERT) != TDB_SUCCESS)
-			return false;
-	}
-	return true;
-}
-
-static int mark_entry(struct tdb_context *tdb,
-		      TDB_DATA key, TDB_DATA data, bool found[])
-{
-	unsigned int num;
-
-	if (key.dsize != sizeof(num))
-		return -1;
-	memcpy(&num, key.dptr, key.dsize);
-	if (num >= NUM_TESTS)
-		return -1;
-	if (found[num])
-		return -1;
-	found[num] = true;
-	return 0;
-}
-
-static bool is_all_set(bool found[], unsigned int num)
-{
-	unsigned int i;
-
-	for (i = 0; i < num; i++)
-		if (!found[i])
-			return false;
-	return true;
-}
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	bool found[NUM_TESTS];
-	struct tdb_context *tdb;
-	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
-			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT
-	};
-
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 6 + 1);
-
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		tdb = tdb_open("run-93-repack.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		ok1(tdb);
-		if (!tdb)
-			break;
-
-		ok1(store_all(tdb));
-
-		ok1(tdb_repack(tdb) == TDB_SUCCESS);
-		memset(found, 0, sizeof(found));
-		ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS);
-		ok1(tdb_traverse(tdb, mark_entry, found) == NUM_TESTS);
-		ok1(is_all_set(found, NUM_TESTS));
-		tdb_close(tdb);
-	}
-
-	ok1(tap_log_messages == 0);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/api-add-remove-flags.c b/lib/tdb2/test/api-add-remove-flags.c
deleted file mode 100644
index a72b609fcb..0000000000
--- a/lib/tdb2/test/api-add-remove-flags.c
+++ /dev/null
@@ -1,89 +0,0 @@
-#include "private.h" // for tdb_context
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "logging.h"
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	struct tdb_context *tdb;
-	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
-			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
-			TDB_NOMMAP|TDB_CONVERT };
-
-	plan_tests(87);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		tdb = tdb_open("run-add-remove-flags.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		ok1(tdb);
-		if (!tdb)
-			continue;
-
-		ok1(tdb_get_flags(tdb) == tdb->flags);
-		tap_log_messages = 0;
-		tdb_add_flag(tdb, TDB_NOLOCK);
-		if (flags[i] & TDB_INTERNAL)
-			ok1(tap_log_messages == 1);
-		else {
-			ok1(tap_log_messages == 0);
-			ok1(tdb_get_flags(tdb) & TDB_NOLOCK);
-		}
-
-		tap_log_messages = 0;
-		tdb_add_flag(tdb, TDB_NOMMAP);
-		if (flags[i] & TDB_INTERNAL)
-			ok1(tap_log_messages == 1);
-		else {
-			ok1(tap_log_messages == 0);
-			ok1(tdb_get_flags(tdb) & TDB_NOMMAP);
-			ok1(tdb->file->map_ptr == NULL);
-		}
-
-		tap_log_messages = 0;
-		tdb_add_flag(tdb, TDB_NOSYNC);
-		if (flags[i] & TDB_INTERNAL)
-			ok1(tap_log_messages == 1);
-		else {
-			ok1(tap_log_messages == 0);
-			ok1(tdb_get_flags(tdb) & TDB_NOSYNC);
-		}
-
-		ok1(tdb_get_flags(tdb) == tdb->flags);
-
-		tap_log_messages = 0;
-		tdb_remove_flag(tdb, TDB_NOLOCK);
-		if (flags[i] & TDB_INTERNAL)
-			ok1(tap_log_messages == 1);
-		else {
-			ok1(tap_log_messages == 0);
-			ok1(!(tdb_get_flags(tdb) & TDB_NOLOCK));
-		}
-
-		tap_log_messages = 0;
-		tdb_remove_flag(tdb, TDB_NOMMAP);
-		if (flags[i] & TDB_INTERNAL)
-			ok1(tap_log_messages == 1);
-		else {
-			ok1(tap_log_messages == 0);
-			ok1(!(tdb_get_flags(tdb) & TDB_NOMMAP));
-			ok1(tdb->file->map_ptr != NULL);
-		}
-
-		tap_log_messages = 0;
-		tdb_remove_flag(tdb, TDB_NOSYNC);
-		if (flags[i] & TDB_INTERNAL)
-			ok1(tap_log_messages == 1);
-		else {
-			ok1(tap_log_messages == 0);
-			ok1(!(tdb_get_flags(tdb) & TDB_NOSYNC));
-		}
-
-		tdb_close(tdb);
-	}
-
-	ok1(tap_log_messages == 0);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/api-check-callback.c b/lib/tdb2/test/api-check-callback.c
deleted file mode 100644
index 96ef09f3bd..0000000000
--- a/lib/tdb2/test/api-check-callback.c
+++ /dev/null
@@ -1,86 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "logging.h"
-
-#define NUM_RECORDS 1000
-
-static bool store_records(struct tdb_context *tdb)
-{
-	int i;
-	struct tdb_data key = { (unsigned char *)&i, sizeof(i) };
-	struct tdb_data data = { (unsigned char *)&i, sizeof(i) };
-
-	for (i = 0; i < NUM_RECORDS; i++)
-		if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
-			return false;
-	return true;
-}
-
-static enum TDB_ERROR check(struct tdb_data key,
-			    struct tdb_data data,
-			    bool *array)
-{
-	int val;
-
-	if (key.dsize != sizeof(val)) {
-		diag("Wrong key size: %u\n", key.dsize);
-		return TDB_ERR_CORRUPT;
-	}
-
-	if (key.dsize != data.dsize
-	    || memcmp(key.dptr, data.dptr, sizeof(val)) != 0) {
-		diag("Key and data differ\n");
-		return TDB_ERR_CORRUPT;
-	}
-
-	memcpy(&val, key.dptr, sizeof(val));
-	if (val >= NUM_RECORDS || val < 0) {
-		diag("check value %i\n", val);
-		return TDB_ERR_CORRUPT;
-	}
-
-	if (array[val]) {
-		diag("Value %i already seen\n", val);
-		return TDB_ERR_CORRUPT;
-	}
-
-	array[val] = true;
-	return TDB_SUCCESS;
-}
-
-int main(int argc, char *argv[])
-{
-	unsigned int i, j;
-	struct tdb_context *tdb;
-	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
-			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
-			TDB_NOMMAP|TDB_CONVERT };
-
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 4 + 1);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		bool array[NUM_RECORDS];
-
-		tdb = tdb_open("run-check-callback.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		ok1(tdb);
-		if (!tdb)
-			continue;
-
-		ok1(store_records(tdb));
-		for (j = 0; j < NUM_RECORDS; j++)
-			array[j] = false;
-		ok1(tdb_check(tdb, check, array) == TDB_SUCCESS);
-		for (j = 0; j < NUM_RECORDS; j++)
-			if (!array[j])
-				break;
-		ok1(j == NUM_RECORDS);
-		tdb_close(tdb);
-	}
-
-	ok1(tap_log_messages == 0);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/api-firstkey-nextkey.c b/lib/tdb2/test/api-firstkey-nextkey.c
deleted file mode 100644
index e5a7c5f8b5..0000000000
--- a/lib/tdb2/test/api-firstkey-nextkey.c
+++ /dev/null
@@ -1,159 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include "logging.h"
-
-#define NUM_RECORDS 1000
-
-static bool store_records(struct tdb_context *tdb)
-{
-	int i;
-	struct tdb_data key = { (unsigned char *)&i, sizeof(i) };
-	struct tdb_data data = { (unsigned char *)&i, sizeof(i) };
-
-	for (i = 0; i < NUM_RECORDS; i++)
-		if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
-			return false;
-	return true;
-}
-
-struct trav_data {
-	unsigned int records[NUM_RECORDS];
-	unsigned int calls;
-};
-
-static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *p)
-{
-	struct trav_data *td = p;
-	int val;
-
-	memcpy(&val, dbuf.dptr, dbuf.dsize);
-	td->records[td->calls++] = val;
-	return 0;
-}
-
-/* Since tdb_nextkey frees dptr, we need to clone it. */
-static TDB_DATA dup_key(TDB_DATA key)
-{
-	void *p = malloc(key.dsize);
-	memcpy(p, key.dptr, key.dsize);
-	key.dptr = p;
-	return key;
-}
-
-int main(int argc, char *argv[])
-{
-	unsigned int i, j;
-	int num;
-	struct trav_data td;
-	TDB_DATA k;
-	struct tdb_context *tdb;
-	union tdb_attribute seed_attr;
-	enum TDB_ERROR ecode;
-	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
-			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
-			TDB_NOMMAP|TDB_CONVERT };
-
-	seed_attr.base.attr = TDB_ATTRIBUTE_SEED;
-	seed_attr.base.next = &tap_log_attr;
-	seed_attr.seed.seed = 6334326220117065685ULL;
-
-	plan_tests(sizeof(flags) / sizeof(flags[0])
-		   * (NUM_RECORDS*6 + (NUM_RECORDS-1)*3 + 22) + 1);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		tdb = tdb_open("api-firstkey-nextkey.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600,
-			       &seed_attr);
-		ok1(tdb);
-		if (!tdb)
-			continue;
-
-		ok1(tdb_firstkey(tdb, &k) == TDB_ERR_NOEXIST);
-
-		/* One entry... */
-		k.dptr = (unsigned char *)&num;
-		k.dsize = sizeof(num);
-		num = 0;
-		ok1(tdb_store(tdb, k, k, TDB_INSERT) == 0);
-		ok1(tdb_firstkey(tdb, &k) == TDB_SUCCESS);
-		ok1(k.dsize == sizeof(num));
-		ok1(memcmp(k.dptr, &num, sizeof(num)) == 0);
-		ok1(tdb_nextkey(tdb, &k) == TDB_ERR_NOEXIST);
-
-		/* Two entries. */
-		k.dptr = (unsigned char *)&num;
-		k.dsize = sizeof(num);
-		num = 1;
-		ok1(tdb_store(tdb, k, k, TDB_INSERT) == 0);
-		ok1(tdb_firstkey(tdb, &k) == TDB_SUCCESS);
-		ok1(k.dsize == sizeof(num));
-		memcpy(&num, k.dptr, sizeof(num));
-		ok1(num == 0 || num == 1);
-		ok1(tdb_nextkey(tdb, &k) == TDB_SUCCESS);
-		ok1(k.dsize == sizeof(j));
-		memcpy(&j, k.dptr, sizeof(j));
-		ok1(j == 0 || j == 1);
-		ok1(j != num);
-		ok1(tdb_nextkey(tdb, &k) == TDB_ERR_NOEXIST);
-
-		/* Clean up. */
-		k.dptr = (unsigned char *)&num;
-		k.dsize = sizeof(num);
-		num = 0;
-		ok1(tdb_delete(tdb, k) == 0);
-		num = 1;
-		ok1(tdb_delete(tdb, k) == 0);
-
-		/* Now lots of records. */
-		ok1(store_records(tdb));
-		td.calls = 0;
-
-		num = tdb_traverse(tdb, trav, &td);
-		ok1(num == NUM_RECORDS);
-		ok1(td.calls == NUM_RECORDS);
-
-		/* Simple loop should match tdb_traverse */
-		for (j = 0, ecode = tdb_firstkey(tdb, &k); j < td.calls; j++) {
-			int val;
-
-			ok1(ecode == TDB_SUCCESS);
-			ok1(k.dsize == sizeof(val));
-			memcpy(&val, k.dptr, k.dsize);
-			ok1(td.records[j] == val);
-			ecode = tdb_nextkey(tdb, &k);
-		}
-
-		/* But arbitrary orderings should work too. */
-		for (j = td.calls-1; j > 0; j--) {
-			k.dptr = (unsigned char *)&td.records[j-1];
-			k.dsize = sizeof(td.records[j-1]);
-			k = dup_key(k);
-			ok1(tdb_nextkey(tdb, &k) == TDB_SUCCESS);
-			ok1(k.dsize == sizeof(td.records[j]));
-			ok1(memcmp(k.dptr, &td.records[j], k.dsize) == 0);
-			free(k.dptr);
-		}
-
-		/* Even delete should work. */
-		for (j = 0, ecode = tdb_firstkey(tdb, &k);
-		     ecode != TDB_ERR_NOEXIST;
-		     j++) {
-			ok1(ecode == TDB_SUCCESS);
-			ok1(k.dsize == 4);
-			ok1(tdb_delete(tdb, k) == 0);
-			ecode = tdb_nextkey(tdb, &k);
-		}
-
-		diag("delete using first/nextkey gave %u of %u records",
-		     j, NUM_RECORDS);
-		ok1(j == NUM_RECORDS);
-		tdb_close(tdb);
-	}
-
-	ok1(tap_log_messages == 0);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/api-fork-test.c b/lib/tdb2/test/api-fork-test.c
deleted file mode 100644
index 934c71cbe8..0000000000
--- a/lib/tdb2/test/api-fork-test.c
+++ /dev/null
@@ -1,179 +0,0 @@
-/* Test forking while holding lock.
- *
- * There are only five ways to do this currently:
- * (1) grab a tdb_chainlock, then fork.
- * (2) grab a tdb_lockall, then fork.
- * (3) grab a tdb_lockall_read, then fork.
- * (4) start a transaction, then fork.
- * (5) fork from inside a tdb_parse() callback.
- *
- * Note that we don't hold a lock across tdb_traverse callbacks, so
- * that doesn't matter.
- */
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include "logging.h"
-
-static enum TDB_ERROR fork_in_parse(TDB_DATA key, TDB_DATA data,
-				    struct tdb_context *tdb)
-{
-	int status;
-
-	if (fork() == 0) {
-		/* We expect this to fail. */
-		if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK)
-			exit(1);
-
-		if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK)
-			exit(1);
-
-		if (tap_log_messages != 2)
-			exit(2);
-
-		tdb_close(tdb);
-		if (tap_log_messages != 2)
-			exit(3);
-		exit(0);
-	}
-	wait(&status);
-	ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0);
-	return TDB_SUCCESS;
-}
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	struct tdb_context *tdb;
-	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
-			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
-	struct tdb_data key = tdb_mkdata("key", 3);
-	struct tdb_data data = tdb_mkdata("data", 4);
-
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 14);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		int status;
-
-		tap_log_messages = 0;
-
-		tdb = tdb_open("run-fork-test.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		if (!ok1(tdb))
-			continue;
-
-		/* Put a record in here. */
-		ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_SUCCESS);
-
-		ok1(tdb_chainlock(tdb, key) == TDB_SUCCESS);
-		if (fork() == 0) {
-			/* We expect this to fail. */
-			if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK)
-				return 1;
-
-			if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK)
-				return 1;
-
-			if (tap_log_messages != 2)
-				return 2;
-
-			tdb_chainunlock(tdb, key);
-			if (tap_log_messages != 3)
-				return 3;
-			tdb_close(tdb);
-			if (tap_log_messages != 3)
-				return 4;
-			return 0;
-		}
-		wait(&status);
-		ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0);
-		tdb_chainunlock(tdb, key);
-
-		ok1(tdb_lockall(tdb) == TDB_SUCCESS);
-		if (fork() == 0) {
-			/* We expect this to fail. */
-			if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK)
-				return 1;
-
-			if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK)
-				return 1;
-
-			if (tap_log_messages != 2)
-				return 2;
-
-			tdb_unlockall(tdb);
-			if (tap_log_messages != 2)
-				return 3;
-			tdb_close(tdb);
-			if (tap_log_messages != 2)
-				return 4;
-			return 0;
-		}
-		wait(&status);
-		ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0);
-		tdb_unlockall(tdb);
-
-		ok1(tdb_lockall_read(tdb) == TDB_SUCCESS);
-		if (fork() == 0) {
-			/* We expect this to fail. */
-			/* This would always fail anyway... */
-			if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK)
-				return 1;
-
-			if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK)
-				return 1;
-
-			if (tap_log_messages != 2)
-				return 2;
-
-			tdb_unlockall_read(tdb);
-			if (tap_log_messages != 2)
-				return 3;
-			tdb_close(tdb);
-			if (tap_log_messages != 2)
-				return 4;
-			return 0;
-		}
-		wait(&status);
-		ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0);
-		tdb_unlockall_read(tdb);
-
-		ok1(tdb_transaction_start(tdb) == TDB_SUCCESS);
-		/* If transactions is empty, noop "commit" succeeds. */
-		ok1(tdb_delete(tdb, key) == TDB_SUCCESS);
-		if (fork() == 0) {
-			/* We expect this to fail. */
-			if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK)
-				return 1;
-
-			if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK)
-				return 1;
-
-			if (tap_log_messages != 2)
-				return 2;
-
-			if (tdb_transaction_commit(tdb) != TDB_ERR_LOCK)
-				return 3;
-
-			tdb_close(tdb);
-			if (tap_log_messages < 3)
-				return 4;
-			return 0;
-		}
-		wait(&status);
-		ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0);
-		tdb_transaction_cancel(tdb);
-
-		ok1(tdb_parse_record(tdb, key, fork_in_parse, tdb)
-		    == TDB_SUCCESS);
-		tdb_close(tdb);
-		ok1(tap_log_messages == 0);
-	}
-	return exit_status();
-}
diff --git a/lib/tdb2/test/api-locktimeout.c b/lib/tdb2/test/api-locktimeout.c
deleted file mode 100644
index dabe262f25..0000000000
--- a/lib/tdb2/test/api-locktimeout.c
+++ /dev/null
@@ -1,193 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include "system/wait.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <fcntl.h>
-#include <limits.h>
-#include <errno.h>
-#include "logging.h"
-#include "external-agent.h"
-
-#undef alarm
-#define alarm fast_alarm
-
-/* Speed things up by doing things in milliseconds. */
-static unsigned int fast_alarm(unsigned int milli_seconds)
-{
-	struct itimerval it;
-
-	it.it_interval.tv_sec = it.it_interval.tv_usec = 0;
-	it.it_value.tv_sec = milli_seconds / 1000;
-	it.it_value.tv_usec = milli_seconds * 1000;
-	setitimer(ITIMER_REAL, &it, NULL);
-	return 0;
-}
-
-#define CatchSignal(sig, handler) signal((sig), (handler))
-
-static void do_nothing(int signum)
-{
-}
-
-/* This example code is taken from SAMBA, so try not to change it. */
-static struct flock flock_struct;
-
-/* Return a value which is none of v1, v2 or v3. */
-static inline short int invalid_value(short int v1, short int v2, short int v3)
-{
-	short int try = (v1+v2+v3)^((v1+v2+v3) << 16);
-	while (try == v1 || try == v2 || try == v3)
-		try++;
-	return try;
-}
-
-/* We invalidate in as many ways as we can, so the OS rejects it */
-static void invalidate_flock_struct(int signum)
-{
-	flock_struct.l_type = invalid_value(F_RDLCK, F_WRLCK, F_UNLCK);
-	flock_struct.l_whence = invalid_value(SEEK_SET, SEEK_CUR, SEEK_END);
-	flock_struct.l_start = -1;
-	/* A large negative. */
-	flock_struct.l_len = (((off_t)1 << (sizeof(off_t)*CHAR_BIT - 1)) + 1);
-}
-
-static int timeout_lock(int fd, int rw, off_t off, off_t len, bool waitflag,
-			void *_timeout)
-{
-	int ret, saved_errno = errno;
-	unsigned int timeout = *(unsigned int *)_timeout;
-
-	flock_struct.l_type = rw;
-	flock_struct.l_whence = SEEK_SET;
-	flock_struct.l_start = off;
-	flock_struct.l_len = len;
-
-	CatchSignal(SIGALRM, invalidate_flock_struct);
-	alarm(timeout);
-
-	for (;;) {
-		if (waitflag)
-			ret = fcntl(fd, F_SETLKW, &flock_struct);
-		else
-			ret = fcntl(fd, F_SETLK, &flock_struct);
-
-		if (ret == 0)
-			break;
-
-		/* Not signalled?  Something else went wrong. */
-		if (flock_struct.l_len == len) {
-			if (errno == EAGAIN || errno == EINTR)
-				continue;
-			saved_errno = errno;
-			break;
-		} else {
-			saved_errno = EINTR;
-			break;
-		}
-	}
-
-	alarm(0);
-	errno = saved_errno;
-	return ret;
-}
-
-static int tdb_chainlock_with_timeout_internal(struct tdb_context *tdb,
-					       TDB_DATA key,
-					       unsigned int timeout,
-					       int rw_type)
-{
-	union tdb_attribute locking;
-	enum TDB_ERROR ecode;
-
-	if (timeout) {
-		locking.base.attr = TDB_ATTRIBUTE_FLOCK;
-		ecode = tdb_get_attribute(tdb, &locking);
-		if (ecode != TDB_SUCCESS)
-			return ecode;
-
-		/* Replace locking function with our own. */
-		locking.flock.data = &timeout;
-		locking.flock.lock = timeout_lock;
-
-		ecode = tdb_set_attribute(tdb, &locking);
-		if (ecode != TDB_SUCCESS)
-			return ecode;
-	}
-	if (rw_type == F_RDLCK)
-		ecode = tdb_chainlock_read(tdb, key);
-	else
-		ecode = tdb_chainlock(tdb, key);
-
-	if (timeout) {
-		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_FLOCK);
-	}
-	return ecode;
-}
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	struct tdb_context *tdb;
-	TDB_DATA key = tdb_mkdata("hello", 5);
-	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
-			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
-	struct agent *agent;
-
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 15);
-
-	agent = prepare_external_agent();
-
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		enum TDB_ERROR ecode;
-		tdb = tdb_open("run-locktimeout.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		if (!ok1(tdb))
-			break;
-
-		/* Simple cases: should succeed. */
-		ecode = tdb_chainlock_with_timeout_internal(tdb, key, 20,
-							    F_RDLCK);
-		ok1(ecode == TDB_SUCCESS);
-		ok1(tap_log_messages == 0);
-
-		tdb_chainunlock_read(tdb, key);
-		ok1(tap_log_messages == 0);
-
-		ecode = tdb_chainlock_with_timeout_internal(tdb, key, 20,
-							    F_WRLCK);
-		ok1(ecode == TDB_SUCCESS);
-		ok1(tap_log_messages == 0);
-
-		tdb_chainunlock(tdb, key);
-		ok1(tap_log_messages == 0);
-
-		/* OK, get agent to start transaction, then we should time out. */
-		ok1(external_agent_operation(agent, OPEN, "run-locktimeout.tdb")
-		    == SUCCESS);
-		ok1(external_agent_operation(agent, TRANSACTION_START, "")
-		    == SUCCESS);
-		ecode = tdb_chainlock_with_timeout_internal(tdb, key, 20,
-							    F_WRLCK);
-		ok1(ecode == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 0);
-
-		/* Even if we get a different signal, should be fine. */
-		CatchSignal(SIGUSR1, do_nothing);
-		external_agent_operation(agent, SEND_SIGNAL, "");
-		ecode = tdb_chainlock_with_timeout_internal(tdb, key, 20,
-							    F_WRLCK);
-		ok1(ecode == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 0);
-
-		ok1(external_agent_operation(agent, TRANSACTION_COMMIT, "")
-		    == SUCCESS);
-		ok1(external_agent_operation(agent, CLOSE, "")
-		    == SUCCESS);
-		tdb_close(tdb);
-	}
-	free_external_agent(agent);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/api-missing-entries.c b/lib/tdb2/test/api-missing-entries.c
deleted file mode 100644
index c81839bc05..0000000000
--- a/lib/tdb2/test/api-missing-entries.c
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Another test revealed that we lost an entry.  This reproduces it. */
-#include "config.h"
-#include "tdb2.h"
-#include <ccan/hash/hash.h>
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "logging.h"
-
-#define NUM_RECORDS 1189
-
-/* We use the same seed which we saw this failure on. */
-static uint64_t failhash(const void *key, size_t len, uint64_t seed, void *p)
-{
-	seed = 699537674708983027ULL;
-	return hash64_stable((const unsigned char *)key, len, seed);
-}
-
-int main(int argc, char *argv[])
-{
-	int i;
-	struct tdb_context *tdb;
-	struct tdb_data key = { (unsigned char *)&i, sizeof(i) };
-	struct tdb_data data = { (unsigned char *)&i, sizeof(i) };
-	union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
-						.fn = failhash } };
-
-	hattr.base.next = &tap_log_attr;
-	plan_tests(1 + NUM_RECORDS + 2);
-
-	tdb = tdb_open("run-missing-entries.tdb", TDB_INTERNAL,
-		       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
-	if (ok1(tdb)) {
-		for (i = 0; i < NUM_RECORDS; i++) {
-			ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0);
-		}
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-		tdb_close(tdb);
-	}
-
-	ok1(tap_log_messages == 0);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/api-open-multiple-times.c b/lib/tdb2/test/api-open-multiple-times.c
deleted file mode 100644
index 38aea135ac..0000000000
--- a/lib/tdb2/test/api-open-multiple-times.c
+++ /dev/null
@@ -1,83 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include "logging.h"
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	struct tdb_context *tdb, *tdb2;
-	struct tdb_data key = { (unsigned char *)&i, sizeof(i) };
-	struct tdb_data data = { (unsigned char *)&i, sizeof(i) };
-	struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */
-	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
-			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
-
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 28);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		tdb = tdb_open("run-open-multiple-times.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		ok1(tdb);
-		if (!tdb)
-			continue;
-
-		tdb2 = tdb_open("run-open-multiple-times.tdb", flags[i],
-				O_RDWR|O_CREAT, 0600, &tap_log_attr);
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-		ok1(tdb_check(tdb2, NULL, NULL) == 0);
-
-		/* Store in one, fetch in the other. */
-		ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0);
-		ok1(tdb_fetch(tdb2, key, &d) == TDB_SUCCESS);
-		ok1(tdb_deq(d, data));
-		free(d.dptr);
-
-		/* Vice versa, with delete. */
-		ok1(tdb_delete(tdb2, key) == 0);
-		ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_NOEXIST);
-
-		/* OK, now close first one, check second still good. */
-		ok1(tdb_close(tdb) == 0);
-
-		ok1(tdb_store(tdb2, key, data, TDB_REPLACE) == 0);
-		ok1(tdb_fetch(tdb2, key, &d) == TDB_SUCCESS);
-		ok1(tdb_deq(d, data));
-		free(d.dptr);
-
-		/* Reopen */
-		tdb = tdb_open("run-open-multiple-times.tdb", flags[i],
-			       O_RDWR|O_CREAT, 0600, &tap_log_attr);
-		ok1(tdb);
-
-		ok1(tdb_transaction_start(tdb2) == 0);
-
-		/* Anything in the other one should fail. */
-		ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 1);
-		ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 2);
-		ok1(tdb_transaction_start(tdb) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 3);
-		ok1(tdb_chainlock(tdb, key) == TDB_ERR_LOCK);
-		ok1(tap_log_messages == 4);
-
-		/* Transaciton should work as normal. */
-		ok1(tdb_store(tdb2, key, data, TDB_REPLACE) == TDB_SUCCESS);
-
-		/* Now... try closing with locks held. */
-		ok1(tdb_close(tdb2) == 0);
-
-		ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
-		ok1(tdb_deq(d, data));
-		free(d.dptr);
-		ok1(tdb_close(tdb) == 0);
-		ok1(tap_log_messages == 4);
-		tap_log_messages = 0;
-	}
-
-	return exit_status();
-}
diff --git a/lib/tdb2/test/api-record-expand.c b/lib/tdb2/test/api-record-expand.c
deleted file mode 100644
index 34799ebe5e..0000000000
--- a/lib/tdb2/test/api-record-expand.c
+++ /dev/null
@@ -1,51 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include "logging.h"
-
-#define MAX_SIZE 10000
-#define SIZE_STEP 131
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	struct tdb_context *tdb;
-	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
-			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
-			TDB_NOMMAP|TDB_CONVERT };
-	struct tdb_data key = tdb_mkdata("key", 3);
-	struct tdb_data data;
-
-	data.dptr = malloc(MAX_SIZE);
-	memset(data.dptr, 0x24, MAX_SIZE);
-
-	plan_tests(sizeof(flags) / sizeof(flags[0])
-		   * (3 + (1 + (MAX_SIZE/SIZE_STEP)) * 2) + 1);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		tdb = tdb_open("run-record-expand.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		ok1(tdb);
-		if (!tdb)
-			continue;
-
-		data.dsize = 0;
-		ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-		for (data.dsize = 0;
-		     data.dsize < MAX_SIZE;
-		     data.dsize += SIZE_STEP) {
-			memset(data.dptr, data.dsize, data.dsize);
-			ok1(tdb_store(tdb, key, data, TDB_MODIFY) == 0);
-			ok1(tdb_check(tdb, NULL, NULL) == 0);
-		}
-		tdb_close(tdb);
-	}
-	ok1(tap_log_messages == 0);
-	free(data.dptr);
-
-	return exit_status();
-}
diff --git a/lib/tdb2/test/api-simple-delete.c b/lib/tdb2/test/api-simple-delete.c
deleted file mode 100644
index 48b077a6db..0000000000
--- a/lib/tdb2/test/api-simple-delete.c
+++ /dev/null
@@ -1,39 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "logging.h"
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	struct tdb_context *tdb;
-	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
-			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
-			TDB_NOMMAP|TDB_CONVERT };
-	struct tdb_data key = tdb_mkdata("key", 3);
-	struct tdb_data data = tdb_mkdata("data", 4);
-
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 7 + 1);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		tdb = tdb_open("run-simple-delete.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		ok1(tdb);
-		if (tdb) {
-			/* Delete should fail. */
-			ok1(tdb_delete(tdb, key) == TDB_ERR_NOEXIST);
-			ok1(tdb_check(tdb, NULL, NULL) == 0);
-			/* Insert should succeed. */
-			ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
-			ok1(tdb_check(tdb, NULL, NULL) == 0);
-			/* Delete should now work. */
-			ok1(tdb_delete(tdb, key) == 0);
-			ok1(tdb_check(tdb, NULL, NULL) == 0);
-			tdb_close(tdb);
-		}
-	}
-	ok1(tap_log_messages == 0);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/api-summary.c b/lib/tdb2/test/api-summary.c
deleted file mode 100644
index e9dfd270e9..0000000000
--- a/lib/tdb2/test/api-summary.c
+++ /dev/null
@@ -1,58 +0,0 @@
-#include "config.h"
-#include "tdb2.h"
-#include "tap-interface.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include "logging.h"
-
-int main(int argc, char *argv[])
-{
-	unsigned int i, j;
-	struct tdb_context *tdb;
-	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
-			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
-			TDB_NOMMAP|TDB_CONVERT };
-	struct tdb_data key = { (unsigned char *)&j, sizeof(j) };
-	struct tdb_data data = { (unsigned char *)&j, sizeof(j) };
-	char *summary;
-
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * (1 + 2 * 5) + 1);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		tdb = tdb_open("run-summary.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		ok1(tdb);
-		if (!tdb)
-			continue;
-
-		/* Put some stuff in there. */
-		for (j = 0; j < 500; j++) {
-			/* Make sure padding varies to we get some graphs! */
-			data.dsize = j % (sizeof(j) + 1);
-			if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
-				fail("Storing in tdb");
-		}
-
-		for (j = 0;
-		     j <= TDB_SUMMARY_HISTOGRAMS;
-		     j += TDB_SUMMARY_HISTOGRAMS) {
-			ok1(tdb_summary(tdb, j, &summary) == TDB_SUCCESS);
-			ok1(strstr(summary, "Number of records: 500\n"));
-			ok1(strstr(summary, "Smallest/average/largest keys: 4/4/4\n"));
-			ok1(strstr(summary, "Smallest/average/largest data: 0/2/4\n"));
-			if (j == TDB_SUMMARY_HISTOGRAMS) {
-				ok1(strstr(summary, "|")
-				    && strstr(summary, "*"));
-			} else {
-				ok1(!strstr(summary, "|")
-				    && !strstr(summary, "*"));
-			}
-			free(summary);
-		}
-		tdb_close(tdb);
-	}
-
-	ok1(tap_log_messages == 0);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/external-agent.c b/lib/tdb2/test/external-agent.c
deleted file mode 100644
index e8cff95728..0000000000
--- a/lib/tdb2/test/external-agent.c
+++ /dev/null
@@ -1,252 +0,0 @@
-#include "external-agent.h"
-#include "logging.h"
-#include "lock-tracking.h"
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <unistd.h>
-#include <ccan/err/err.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include <limits.h>
-#include <string.h>
-#include <errno.h>
-#include "tap-interface.h"
-#include <stdio.h>
-#include <stdarg.h>
-
-static struct tdb_context *tdb;
-
-void (*external_agent_free)(void *) = free;
-
-static enum TDB_ERROR clear_if_first(int fd, void *arg)
-{
-/* We hold a lock offset 4 always, so we can tell if anyone is holding it.
- * (This is compatible with tdb1's TDB_CLEAR_IF_FIRST flag).  */
-	struct flock fl;
-
-	fl.l_type = F_WRLCK;
-	fl.l_whence = SEEK_SET;
-	fl.l_start = 4;
-	fl.l_len = 1;
-
-	if (fcntl(fd, F_SETLK, &fl) == 0) {
-		/* We must be first ones to open it! */
-		diag("agent truncating file!");
-		if (ftruncate(fd, 0) != 0) {
-			return TDB_ERR_IO;
-		}
-	}
-	fl.l_type = F_RDLCK;
-	if (fcntl(fd, F_SETLKW, &fl) != 0) {
-		return TDB_ERR_IO;
-	}
-	return TDB_SUCCESS;
-}
-
-static enum agent_return do_operation(enum operation op, const char *name)
-{
-	TDB_DATA k;
-	enum agent_return ret;
-	TDB_DATA data;
-	enum TDB_ERROR ecode;
-	union tdb_attribute cif;
-
-	if (op != OPEN && op != OPEN_WITH_HOOK && !tdb) {
-		diag("external: No tdb open!");
-		return OTHER_FAILURE;
-	}
-
-	diag("external: %s", operation_name(op));
-
-	k = tdb_mkdata(name, strlen(name));
-
-	locking_would_block = 0;
-	switch (op) {
-	case OPEN:
-		if (tdb) {
-			diag("Already have tdb %s open", tdb_name(tdb));
-			return OTHER_FAILURE;
-		}
-		tdb = tdb_open(name, TDB_DEFAULT, O_RDWR, 0, &tap_log_attr);
-		if (!tdb) {
-			if (!locking_would_block)
-				diag("Opening tdb gave %s", strerror(errno));
-			forget_locking();
-			ret = OTHER_FAILURE;
-		} else
-			ret = SUCCESS;
-		break;
-	case OPEN_WITH_HOOK:
-		if (tdb) {
-			diag("Already have tdb %s open", tdb_name(tdb));
-			return OTHER_FAILURE;
-		}
-		cif.openhook.base.attr = TDB_ATTRIBUTE_OPENHOOK;
-		cif.openhook.base.next = &tap_log_attr;
-		cif.openhook.fn = clear_if_first;
-		tdb = tdb_open(name, TDB_DEFAULT, O_RDWR, 0, &cif);
-		if (!tdb) {
-			if (!locking_would_block)
-				diag("Opening tdb gave %s", strerror(errno));
-			forget_locking();
-			ret = OTHER_FAILURE;
-		} else
-			ret = SUCCESS;
-		break;
-	case FETCH:
-		ecode = tdb_fetch(tdb, k, &data);
-		if (ecode == TDB_ERR_NOEXIST) {
-			ret = FAILED;
-		} else if (ecode < 0) {
-			ret = OTHER_FAILURE;
-		} else if (!tdb_deq(data, k)) {
-			ret = OTHER_FAILURE;
-			external_agent_free(data.dptr);
-		} else {
-			ret = SUCCESS;
-			external_agent_free(data.dptr);
-		}
-		break;
-	case STORE:
-		ret = tdb_store(tdb, k, k, 0) == 0 ? SUCCESS : OTHER_FAILURE;
-		break;
-	case TRANSACTION_START:
-		ret = tdb_transaction_start(tdb) == 0 ? SUCCESS : OTHER_FAILURE;
-		break;
-	case TRANSACTION_COMMIT:
-		ret = tdb_transaction_commit(tdb)==0 ? SUCCESS : OTHER_FAILURE;
-		break;
-	case NEEDS_RECOVERY:
-		ret = external_agent_needs_rec(tdb);
-		break;
-	case CHECK:
-		ret = tdb_check(tdb, NULL, NULL) == 0 ? SUCCESS : OTHER_FAILURE;
-		break;
-	case CLOSE:
-		ret = tdb_close(tdb) == 0 ? SUCCESS : OTHER_FAILURE;
-		tdb = NULL;
-		break;
-	case SEND_SIGNAL:
-		/* We do this async */
-		ret = SUCCESS;
-		break;
-	default:
-		ret = OTHER_FAILURE;
-	}
-
-	if (locking_would_block)
-		ret = WOULD_HAVE_BLOCKED;
-
-	return ret;
-}
-
-struct agent {
-	int cmdfd, responsefd;
-};
-
-/* Do this before doing any tdb stuff.  Return handle, or NULL. */
-struct agent *prepare_external_agent(void)
-{
-	int pid, ret;
-	int command[2], response[2];
-	char name[1+PATH_MAX];
-
-	if (pipe(command) != 0 || pipe(response) != 0)
-		return NULL;
-
-	pid = fork();
-	if (pid < 0)
-		return NULL;
-
-	if (pid != 0) {
-		struct agent *agent = malloc(sizeof(*agent));
-
-		close(command[0]);
-		close(response[1]);
-		agent->cmdfd = command[1];
-		agent->responsefd = response[0];
-		return agent;
-	}
-
-	close(command[1]);
-	close(response[0]);
-
-	/* We want to fail, not block. */
-	nonblocking_locks = true;
-	log_prefix = "external: ";
-	while ((ret = read(command[0], name, sizeof(name))) > 0) {
-		enum agent_return result;
-
-		result = do_operation(name[0], name+1);
-		if (write(response[1], &result, sizeof(result))
-		    != sizeof(result))
-			err(1, "Writing response");
-		if (name[0] == SEND_SIGNAL) {
-			struct timeval ten_ms;
-			ten_ms.tv_sec = 0;
-			ten_ms.tv_usec = 10000;
-			select(0, NULL, NULL, NULL, &ten_ms);
-			kill(getppid(), SIGUSR1);
-		}
-	}
-	exit(0);
-}
-
-/* Ask the external agent to try to do an operation. */
-enum agent_return external_agent_operation(struct agent *agent,
-					   enum operation op,
-					   const char *name)
-{
-	enum agent_return res;
-	unsigned int len;
-	char *string;
-
-	if (!name)
-		name = "";
-	len = 1 + strlen(name) + 1;
-	string = malloc(len);
-
-	string[0] = op;
-	strcpy(string+1, name);
-
-	if (write(agent->cmdfd, string, len) != len
-	    || read(agent->responsefd, &res, sizeof(res)) != sizeof(res))
-		res = AGENT_DIED;
-
-	free(string);
-	return res;
-}
-
-const char *agent_return_name(enum agent_return ret)
-{
-	return ret == SUCCESS ? "SUCCESS"
-		: ret == WOULD_HAVE_BLOCKED ? "WOULD_HAVE_BLOCKED"
-		: ret == AGENT_DIED ? "AGENT_DIED"
-		: ret == FAILED ? "FAILED"
-		: ret == OTHER_FAILURE ? "OTHER_FAILURE"
-		: "**INVALID**";
-}
-
-const char *operation_name(enum operation op)
-{
-	switch (op) {
-	case OPEN: return "OPEN";
-	case OPEN_WITH_HOOK: return "OPEN_WITH_HOOK";
-	case FETCH: return "FETCH";
-	case STORE: return "STORE";
-	case CHECK: return "CHECK";
-	case TRANSACTION_START: return "TRANSACTION_START";
-	case TRANSACTION_COMMIT: return "TRANSACTION_COMMIT";
-	case NEEDS_RECOVERY: return "NEEDS_RECOVERY";
-	case SEND_SIGNAL: return "SEND_SIGNAL";
-	case CLOSE: return "CLOSE";
-	}
-	return "**INVALID**";
-}
-
-void free_external_agent(struct agent *agent)
-{
-	close(agent->cmdfd);
-	close(agent->responsefd);
-	free(agent);
-}
diff --git a/lib/tdb2/test/external-agent.h b/lib/tdb2/test/external-agent.h
deleted file mode 100644
index c4cd2b148d..0000000000
--- a/lib/tdb2/test/external-agent.h
+++ /dev/null
@@ -1,51 +0,0 @@
-#ifndef TDB2_TEST_EXTERNAL_AGENT_H
-#define TDB2_TEST_EXTERNAL_AGENT_H
-
-/* For locking tests, we need a different process to try things at
- * various times. */
-enum operation {
-	OPEN,
-	OPEN_WITH_HOOK,
-	FETCH,
-	STORE,
-	TRANSACTION_START,
-	TRANSACTION_COMMIT,
-	NEEDS_RECOVERY,
-	CHECK,
-	SEND_SIGNAL,
-	CLOSE,
-};
-
-/* Do this before doing any tdb stuff.  Return handle, or -1. */
-struct agent *prepare_external_agent(void);
-
-enum agent_return {
-	SUCCESS,
-	WOULD_HAVE_BLOCKED,
-	AGENT_DIED,
-	FAILED, /* For fetch, or NEEDS_RECOVERY */
-	OTHER_FAILURE,
-};
-
-/* Ask the external agent to try to do an operation.
- * name == tdb name for OPEN/OPEN_WITH_CLEAR_IF_FIRST,
- * record name for FETCH/STORE (store stores name as data too)
- */
-enum agent_return external_agent_operation(struct agent *handle,
-					   enum operation op,
-					   const char *name);
-
-/* Hook into free() on tdb_data in external agent. */
-extern void (*external_agent_free)(void *);
-
-/* Mapping enum -> string. */
-const char *agent_return_name(enum agent_return ret);
-const char *operation_name(enum operation op);
-
-void free_external_agent(struct agent *agent);
-
-/* Internal use: */
-struct tdb_context;
-enum agent_return external_agent_needs_rec(struct tdb_context *tdb);
-
-#endif /* TDB2_TEST_EXTERNAL_AGENT_H */
diff --git a/lib/tdb2/test/failtest_helper.c b/lib/tdb2/test/failtest_helper.c
deleted file mode 100644
index 386f1c2379..0000000000
--- a/lib/tdb2/test/failtest_helper.c
+++ /dev/null
@@ -1,96 +0,0 @@
-#include "failtest_helper.h"
-#include "logging.h"
-#include <string.h>
-#include "tap-interface.h"
-
-bool failtest_suppress = false;
-
-/* FIXME: From ccan/str */
-static inline bool strends(const char *str, const char *postfix)
-{
-	if (strlen(str) < strlen(postfix))
-		return false;
-
-	return !strcmp(str + strlen(str) - strlen(postfix), postfix);
-}
-
-bool failmatch(const struct failtest_call *call,
-	       const char *file, int line, enum failtest_call_type type)
-{
-	return call->type == type
-		&& call->line == line
-		&& ((strcmp(call->file, file) == 0)
-		    || (strends(call->file, file)
-			&& (call->file[strlen(call->file) - strlen(file) - 1]
-			    == '/')));
-}
-
-static bool is_nonblocking_lock(const struct failtest_call *call)
-{
-	return call->type == FAILTEST_FCNTL && call->u.fcntl.cmd == F_SETLK;
-}
-
-static bool is_unlock(const struct failtest_call *call)
-{
-	return call->type == FAILTEST_FCNTL
-		&& call->u.fcntl.arg.fl.l_type == F_UNLCK;
-}
-
-bool exit_check_log(struct tlist_calls *history)
-{
-	const struct failtest_call *i;
-
-	tlist_for_each(history, i, list) {
-		if (!i->fail)
-			continue;
-		/* Failing the /dev/urandom open doesn't count: we fall back. */
-		if (failmatch(i, URANDOM_OPEN))
-			continue;
-
-		/* Similarly with read fail. */
-		if (failmatch(i, URANDOM_READ))
-			continue;
-
-		/* Initial allocation of tdb doesn't log. */
-		if (failmatch(i, INITIAL_TDB_MALLOC))
-			continue;
-
-		/* We don't block "failures" on non-blocking locks. */
-		if (is_nonblocking_lock(i))
-			continue;
-
-		if (!tap_log_messages)
-			diag("We didn't log for %s:%u", i->file, i->line);
-		return tap_log_messages != 0;
-	}
-	return true;
-}
-
-/* Some places we soldier on despite errors: only fail them once. */
-enum failtest_result
-block_repeat_failures(struct tlist_calls *history)
-{
-	const struct failtest_call *last;
-
-	last = tlist_tail(history, list);
-
-	if (failtest_suppress)
-		return FAIL_DONT_FAIL;
-
-	if (failmatch(last, INITIAL_TDB_MALLOC)
-	    || failmatch(last, URANDOM_OPEN)
-	    || failmatch(last, URANDOM_READ)) {
-		return FAIL_PROBE;
-	}
-
-	/* We handle mmap failing, by falling back to read/write, so
-	 * don't try all possible paths. */
-	if (last->type == FAILTEST_MMAP)
-		return FAIL_PROBE;
-
-	/* Unlock or non-blocking lock is fail-once. */
-	if (is_unlock(last) || is_nonblocking_lock(last))
-		return FAIL_PROBE;
-
-	return FAIL_OK;
-}
diff --git a/lib/tdb2/test/failtest_helper.h b/lib/tdb2/test/failtest_helper.h
deleted file mode 100644
index 3c509e7c38..0000000000
--- a/lib/tdb2/test/failtest_helper.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef TDB2_TEST_FAILTEST_HELPER_H
-#define TDB2_TEST_FAILTEST_HELPER_H
-#include <ccan/failtest/failtest.h>
-#include <stdbool.h>
-
-/* FIXME: Check these! */
-#define INITIAL_TDB_MALLOC	"open.c", 403, FAILTEST_MALLOC
-#define URANDOM_OPEN		"open.c", 62, FAILTEST_OPEN
-#define URANDOM_READ		"open.c", 42, FAILTEST_READ
-
-bool exit_check_log(struct tlist_calls *history);
-bool failmatch(const struct failtest_call *call,
-	       const char *file, int line, enum failtest_call_type type);
-enum failtest_result block_repeat_failures(struct tlist_calls *history);
-
-/* Set this to suppress failure. */
-extern bool failtest_suppress;
-
-#endif /* TDB2_TEST_LOGGING_H */
diff --git a/lib/tdb2/test/helpapi-external-agent.c b/lib/tdb2/test/helpapi-external-agent.c
deleted file mode 100644
index 59e1c6cbee..0000000000
--- a/lib/tdb2/test/helpapi-external-agent.c
+++ /dev/null
@@ -1,7 +0,0 @@
-#include "external-agent.h"
-
-/* This isn't possible with via the tdb2 API, but this makes it link. */
-enum agent_return external_agent_needs_rec(struct tdb_context *tdb)
-{
-	return FAILED;
-}
diff --git a/lib/tdb2/test/helprun-external-agent.c b/lib/tdb2/test/helprun-external-agent.c
deleted file mode 100644
index 9f243824fd..0000000000
--- a/lib/tdb2/test/helprun-external-agent.c
+++ /dev/null
@@ -1,7 +0,0 @@
-#include "external-agent.h"
-#include "private.h"
-
-enum agent_return external_agent_needs_rec(struct tdb_context *tdb)
-{
-	return tdb_needs_recovery(tdb) ? SUCCESS : FAILED;
-}
diff --git a/lib/tdb2/test/helprun-layout.c b/lib/tdb2/test/helprun-layout.c
deleted file mode 100644
index b9cd4a6432..0000000000
--- a/lib/tdb2/test/helprun-layout.c
+++ /dev/null
@@ -1,402 +0,0 @@
-/* TDB tools to create various canned database layouts. */
-#include "layout.h"
-#include <stdlib.h>
-#include <string.h>
-#include <assert.h>
-#include <ccan/err/err.h>
-#include "logging.h"
-
-struct tdb_layout *new_tdb_layout(void)
-{
-	struct tdb_layout *layout = malloc(sizeof(*layout));
-	layout->num_elems = 0;
-	layout->elem = NULL;
-	return layout;
-}
-
-static void add(struct tdb_layout *layout, union tdb_layout_elem elem)
-{
-	layout->elem = realloc(layout->elem,
-			       sizeof(layout->elem[0])
-			       * (layout->num_elems+1));
-	layout->elem[layout->num_elems++] = elem;
-}
-
-void tdb_layout_add_freetable(struct tdb_layout *layout)
-{
-	union tdb_layout_elem elem;
-	elem.base.type = FREETABLE;
-	add(layout, elem);
-}
-
-void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len,
-			 unsigned ftable)
-{
-	union tdb_layout_elem elem;
-	elem.base.type = FREE;
-	elem.free.len = len;
-	elem.free.ftable_num = ftable;
-	add(layout, elem);
-}
-
-void tdb_layout_add_capability(struct tdb_layout *layout,
-			       uint64_t type,
-			       bool write_breaks,
-			       bool check_breaks,
-			       bool open_breaks,
-			       tdb_len_t extra)
-{
-	union tdb_layout_elem elem;
-	elem.base.type = CAPABILITY;
-	elem.capability.type = type;
-	if (write_breaks)
-		elem.capability.type |= TDB_CAP_NOWRITE;
-	if (open_breaks)
-		elem.capability.type |= TDB_CAP_NOOPEN;
-	if (check_breaks)
-		elem.capability.type |= TDB_CAP_NOCHECK;
-	elem.capability.extra = extra;
-	add(layout, elem);
-}
-
-static struct tdb_data dup_key(struct tdb_data key)
-{
-	struct tdb_data ret;
-	ret.dsize = key.dsize;
-	ret.dptr = malloc(ret.dsize);
-	memcpy(ret.dptr, key.dptr, ret.dsize);
-	return ret;
-}
-
-void tdb_layout_add_used(struct tdb_layout *layout,
-			 TDB_DATA key, TDB_DATA data,
-			 tdb_len_t extra)
-{
-	union tdb_layout_elem elem;
-	elem.base.type = DATA;
-	elem.used.key = dup_key(key);
-	elem.used.data = dup_key(data);
-	elem.used.extra = extra;
-	add(layout, elem);
-}
-
-static tdb_len_t free_record_len(tdb_len_t len)
-{
-	return sizeof(struct tdb_used_record) + len;
-}
-
-static tdb_len_t data_record_len(struct tle_used *used)
-{
-	tdb_len_t len;
-	len = sizeof(struct tdb_used_record)
-		+ used->key.dsize + used->data.dsize + used->extra;
-	assert(len >= sizeof(struct tdb_free_record));
-	return len;
-}
-
-static tdb_len_t hashtable_len(struct tle_hashtable *htable)
-{
-	return sizeof(struct tdb_used_record)
-		+ (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS)
-		+ htable->extra;
-}
-
-static tdb_len_t capability_len(struct tle_capability *cap)
-{
-	return sizeof(struct tdb_capability) + cap->extra;
-}
-
-static tdb_len_t freetable_len(struct tle_freetable *ftable)
-{
-	return sizeof(struct tdb_freetable);
-}
-
-static void set_free_record(void *mem, tdb_len_t len)
-{
-	/* We do all the work in add_to_freetable */
-}
-
-static void add_zero_pad(struct tdb_used_record *u, size_t len, size_t extra)
-{
-	if (extra)
-		((char *)(u + 1))[len] = '\0';
-}
-
-static void set_data_record(void *mem, struct tdb_context *tdb,
-			    struct tle_used *used)
-{
-	struct tdb_used_record *u = mem;
-
-	set_header(tdb, u, TDB_USED_MAGIC, used->key.dsize, used->data.dsize,
-		   used->key.dsize + used->data.dsize + used->extra,
-		   tdb_hash(tdb, used->key.dptr, used->key.dsize));
-	memcpy(u + 1, used->key.dptr, used->key.dsize);
-	memcpy((char *)(u + 1) + used->key.dsize,
-	       used->data.dptr, used->data.dsize);
-	add_zero_pad(u, used->key.dsize + used->data.dsize, used->extra);
-}
-
-static void set_hashtable(void *mem, struct tdb_context *tdb,
-			  struct tle_hashtable *htable)
-{
-	struct tdb_used_record *u = mem;
-	tdb_len_t len = sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS;
-
-	set_header(tdb, u, TDB_HTABLE_MAGIC, 0, len, len + htable->extra, 0);
-	memset(u + 1, 0, len);
-	add_zero_pad(u, len, htable->extra);
-}
-
-static void set_capability(void *mem, struct tdb_context *tdb,
-			   struct tle_capability *cap, struct tdb_header *hdr,
-			   tdb_off_t last_cap)
-{
-	struct tdb_capability *c = mem;
-	tdb_len_t len = sizeof(*c) - sizeof(struct tdb_used_record) + cap->extra;
-
-	c->type = cap->type;
-	c->next = 0;
-	set_header(tdb, &c->hdr, TDB_CAP_MAGIC, 0, len, len, 0);
-
-	/* Append to capability list. */
-	if (!last_cap) {
-		hdr->capabilities = cap->base.off;
-	} else {
-		c = (struct tdb_capability *)((char *)hdr + last_cap);
-		c->next = cap->base.off;
-	}
-}
-
-static void set_freetable(void *mem, struct tdb_context *tdb,
-			 struct tle_freetable *freetable, struct tdb_header *hdr,
-			 tdb_off_t last_ftable)
-{
-	struct tdb_freetable *ftable = mem;
-	memset(ftable, 0, sizeof(*ftable));
-	set_header(tdb, &ftable->hdr, TDB_FTABLE_MAGIC, 0,
-			sizeof(*ftable) - sizeof(ftable->hdr),
-			sizeof(*ftable) - sizeof(ftable->hdr), 0);
-
-	if (last_ftable) {
-		ftable = (struct tdb_freetable *)((char *)hdr + last_ftable);
-		ftable->next = freetable->base.off;
-	} else {
-		hdr->free_table = freetable->base.off;
-	}
-}
-
-static void add_to_freetable(struct tdb_context *tdb,
-			     tdb_off_t eoff,
-			     tdb_off_t elen,
-			     unsigned ftable,
-			     struct tle_freetable *freetable)
-{
-	tdb->ftable_off = freetable->base.off;
-	tdb->ftable = ftable;
-	add_free_record(tdb, eoff, sizeof(struct tdb_used_record) + elen,
-			TDB_LOCK_WAIT, false);
-}
-
-static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned ingroup)
-{
-	return group_start
-		+ (ingroup % (1 << TDB_HASH_GROUP_BITS)) * sizeof(tdb_off_t);
-}
-
-/* Get bits from a value. */
-static uint32_t bits(uint64_t val, unsigned start, unsigned num)
-{
-	assert(num <= 32);
-	return (val >> start) & ((1U << num) - 1);
-}
-
-/* We take bits from the top: that way we can lock whole sections of the hash
- * by using lock ranges. */
-static uint32_t use_bits(uint64_t h, unsigned num, unsigned *used)
-{
-	*used += num;
-	return bits(h, 64 - *used, num);
-}
-
-static tdb_off_t encode_offset(tdb_off_t new_off, unsigned bucket,
-			       uint64_t h)
-{
-	return bucket
-		| new_off
-		| ((uint64_t)bits(h, 64 - TDB_OFF_UPPER_STEAL_EXTRA,
-				  TDB_OFF_UPPER_STEAL_EXTRA)
-		   << TDB_OFF_HASH_EXTRA_BIT);
-}
-
-/* FIXME: Our hash table handling here is primitive: we don't expand! */
-static void add_to_hashtable(struct tdb_context *tdb,
-			     tdb_off_t eoff,
-			     struct tdb_data key)
-{
-	uint64_t h = tdb_hash(tdb, key.dptr, key.dsize);
-	tdb_off_t b_off, group_start;
-	unsigned i, group, in_group;
-	unsigned used = 0;
-
-	group = use_bits(h, TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS, &used);
-	in_group = use_bits(h, TDB_HASH_GROUP_BITS, &used);
-
-	group_start = offsetof(struct tdb_header, hashtable)
-		+ group * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
-
-	for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) {
-		unsigned bucket = (in_group + i) % (1 << TDB_HASH_GROUP_BITS);
-
-		b_off = hbucket_off(group_start, bucket);
-		if (tdb_read_off(tdb, b_off) == 0) {
-			tdb_write_off(tdb, b_off,
-				      encode_offset(eoff, in_group, h));
-			return;
-		}
-	}
-	abort();
-}
-
-static struct tle_freetable *find_ftable(struct tdb_layout *layout, unsigned num)
-{
-	unsigned i;
-
-	for (i = 0; i < layout->num_elems; i++) {
-		if (layout->elem[i].base.type != FREETABLE)
-			continue;
-		if (num == 0)
-			return &layout->elem[i].ftable;
-		num--;
-	}
-	abort();
-}
-
-/* FIXME: Support TDB_CONVERT */
-struct tdb_context *tdb_layout_get(struct tdb_layout *layout,
-				   void (*freefn)(void *),
-				   union tdb_attribute *attr)
-{
-	unsigned int i;
-	tdb_off_t off, len, last_ftable, last_cap;
-	char *mem;
-	struct tdb_context *tdb;
-
-	off = sizeof(struct tdb_header);
-
-	/* First pass of layout: calc lengths */
-	for (i = 0; i < layout->num_elems; i++) {
-		union tdb_layout_elem *e = &layout->elem[i];
-		e->base.off = off;
-		switch (e->base.type) {
-		case FREETABLE:
-			len = freetable_len(&e->ftable);
-			break;
-		case FREE:
-			len = free_record_len(e->free.len);
-			break;
-		case DATA:
-			len = data_record_len(&e->used);
-			break;
-		case HASHTABLE:
-			len = hashtable_len(&e->hashtable);
-			break;
-		case CAPABILITY:
-			len = capability_len(&e->capability);
-			break;
-		default:
-			abort();
-		}
-		off += len;
-	}
-
-	mem = malloc(off);
-	/* Fill with some weird pattern. */
-	memset(mem, 0x99, off);
-	/* Now populate our header, cribbing from a real TDB header. */
-	tdb = tdb_open(NULL, TDB_INTERNAL, O_RDWR, 0, attr);
-	memcpy(mem, tdb->file->map_ptr, sizeof(struct tdb_header));
-
-	/* Mug the tdb we have to make it use this. */
-	freefn(tdb->file->map_ptr);
-	tdb->file->map_ptr = mem;
-	tdb->file->map_size = off;
-
-	last_ftable = 0;
-	last_cap = 0;
-	for (i = 0; i < layout->num_elems; i++) {
-		union tdb_layout_elem *e = &layout->elem[i];
-		switch (e->base.type) {
-		case FREETABLE:
-			set_freetable(mem + e->base.off, tdb, &e->ftable,
-				     (struct tdb_header *)mem, last_ftable);
-			last_ftable = e->base.off;
-			break;
-		case FREE:
-			set_free_record(mem + e->base.off, e->free.len);
-			break;
-		case DATA:
-			set_data_record(mem + e->base.off, tdb, &e->used);
-			break;
-		case HASHTABLE:
-			set_hashtable(mem + e->base.off, tdb, &e->hashtable);
-			break;
-		case CAPABILITY:
-			set_capability(mem + e->base.off, tdb, &e->capability,
-				       (struct tdb_header *)mem, last_cap);
-			last_cap = e->base.off;
-			break;
-		}
-	}
-	/* Must have a free table! */
-	assert(last_ftable);
-
-	/* Now fill the free and hash tables. */
-	for (i = 0; i < layout->num_elems; i++) {
-		union tdb_layout_elem *e = &layout->elem[i];
-		switch (e->base.type) {
-		case FREE:
-			add_to_freetable(tdb, e->base.off, e->free.len,
-					 e->free.ftable_num,
-					 find_ftable(layout, e->free.ftable_num));
-			break;
-		case DATA:
-			add_to_hashtable(tdb, e->base.off, e->used.key);
-			break;
-		default:
-			break;
-		}
-	}
-
-	tdb->ftable_off = find_ftable(layout, 0)->base.off;
-	return tdb;
-}
-
-void tdb_layout_write(struct tdb_layout *layout, void (*freefn)(void *),
-		       union tdb_attribute *attr, const char *filename)
-{
-	struct tdb_context *tdb = tdb_layout_get(layout, freefn, attr);
-	int fd;
-
-	fd = open(filename, O_WRONLY|O_TRUNC|O_CREAT,  0600);
-	if (fd < 0)
-		err(1, "opening %s for writing", filename);
-	if (write(fd, tdb->file->map_ptr, tdb->file->map_size)
-	    != tdb->file->map_size)
-		err(1, "writing %s", filename);
-	close(fd);
-	tdb_close(tdb);
-}
-
-void tdb_layout_free(struct tdb_layout *layout)
-{
-	unsigned int i;
-
-	for (i = 0; i < layout->num_elems; i++) {
-		if (layout->elem[i].base.type == DATA) {
-			free(layout->elem[i].used.key.dptr);
-			free(layout->elem[i].used.data.dptr);
-		}
-	}
-	free(layout->elem);
-	free(layout);
-}
diff --git a/lib/tdb2/test/layout.h b/lib/tdb2/test/layout.h
deleted file mode 100644
index 3aadf20ee2..0000000000
--- a/lib/tdb2/test/layout.h
+++ /dev/null
@@ -1,87 +0,0 @@
-#ifndef TDB2_TEST_LAYOUT_H
-#define TDB2_TEST_LAYOUT_H
-#include "private.h"
-
-struct tdb_layout *new_tdb_layout(void);
-void tdb_layout_add_freetable(struct tdb_layout *layout);
-void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len,
-			 unsigned ftable);
-void tdb_layout_add_used(struct tdb_layout *layout,
-			 TDB_DATA key, TDB_DATA data,
-			 tdb_len_t extra);
-void tdb_layout_add_capability(struct tdb_layout *layout,
-			       uint64_t type,
-			       bool write_breaks,
-			       bool check_breaks,
-			       bool open_breaks,
-			       tdb_len_t extra);
-
-#if 0 /* FIXME: Allow allocation of subtables */
-void tdb_layout_add_hashtable(struct tdb_layout *layout,
-			      int htable_parent, /* -1 == toplevel */
-			      unsigned int bucket,
-			      tdb_len_t extra);
-#endif
-/* freefn is needed if we're using failtest_free. */
-struct tdb_context *tdb_layout_get(struct tdb_layout *layout,
-				   void (*freefn)(void *),
-				   union tdb_attribute *attr);
-void tdb_layout_write(struct tdb_layout *layout, void (*freefn)(void *),
-		       union tdb_attribute *attr, const char *filename);
-
-void tdb_layout_free(struct tdb_layout *layout);
-
-enum layout_type {
-	FREETABLE, FREE, DATA, HASHTABLE, CAPABILITY
-};
-
-/* Shared by all union members. */
-struct tle_base {
-	enum layout_type type;
-	tdb_off_t off;
-};
-
-struct tle_freetable {
-	struct tle_base base;
-};
-
-struct tle_free {
-	struct tle_base base;
-	tdb_len_t len;
-	unsigned ftable_num;
-};
-
-struct tle_used {
-	struct tle_base base;
-	TDB_DATA key;
-	TDB_DATA data;
-	tdb_len_t extra;
-};
-
-struct tle_hashtable {
-	struct tle_base base;
-	int parent;
-	unsigned int bucket;
-	tdb_len_t extra;
-};
-
-struct tle_capability {
-	struct tle_base base;
-	uint64_t type;
-	tdb_len_t extra;
-};
-
-union tdb_layout_elem {
-	struct tle_base base;
-	struct tle_freetable ftable;
-	struct tle_free free;
-	struct tle_used used;
-	struct tle_hashtable hashtable;
-	struct tle_capability capability;
-};
-
-struct tdb_layout {
-	unsigned int num_elems;
-	union tdb_layout_elem *elem;
-};
-#endif /* TDB2_TEST_LAYOUT_H */
diff --git a/lib/tdb2/test/lock-tracking.c b/lib/tdb2/test/lock-tracking.c
deleted file mode 100644
index c7387ead99..0000000000
--- a/lib/tdb2/test/lock-tracking.c
+++ /dev/null
@@ -1,147 +0,0 @@
-/* We save the locks so we can reaquire them. */
-#include "private.h" /* For TDB_HASH_LOCK_START, etc. */
-#include <unistd.h>
-#include <fcntl.h>
-#include <stdarg.h>
-#include <stdlib.h>
-#include "tap-interface.h"
-#include "lock-tracking.h"
-
-struct lock {
-	struct lock *next;
-	unsigned int off;
-	unsigned int len;
-	int type;
-};
-static struct lock *locks;
-int locking_errors = 0;
-bool suppress_lockcheck = false;
-bool nonblocking_locks;
-int locking_would_block = 0;
-void (*unlock_callback)(int fd);
-
-int fcntl_with_lockcheck(int fd, int cmd, ... /* arg */ )
-{
-	va_list ap;
-	int ret, arg3;
-	struct flock *fl;
-	bool may_block = false;
-
-	if (cmd != F_SETLK && cmd != F_SETLKW) {
-		/* This may be totally bogus, but we don't know in general. */
-		va_start(ap, cmd);
-		arg3 = va_arg(ap, int);
-		va_end(ap);
-
-		return fcntl(fd, cmd, arg3);
-	}
-
-	va_start(ap, cmd);
-	fl = va_arg(ap, struct flock *);
-	va_end(ap);
-
-	if (cmd == F_SETLKW && nonblocking_locks) {
-		cmd = F_SETLK;
-		may_block = true;
-	}
-	ret = fcntl(fd, cmd, fl);
-
-	/* Detect when we failed, but might have been OK if we waited. */
-	if (may_block && ret == -1 && (errno == EAGAIN || errno == EACCES)) {
-		locking_would_block++;
-	}
-
-	if (fl->l_type == F_UNLCK) {
-		struct lock **l;
-		struct lock *old = NULL;
-
-		for (l = &locks; *l; l = &(*l)->next) {
-			if ((*l)->off == fl->l_start
-			    && (*l)->len == fl->l_len) {
-				if (ret == 0) {
-					old = *l;
-					*l = (*l)->next;
-					free(old);
-				}
-				break;
-			}
-		}
-		if (!old && !suppress_lockcheck) {
-			diag("Unknown unlock %u@%u - %i",
-			     (int)fl->l_len, (int)fl->l_start, ret);
-			locking_errors++;
-		}
-	} else {
-		struct lock *new, *i;
-		unsigned int fl_end = fl->l_start + fl->l_len;
-		if (fl->l_len == 0)
-			fl_end = (unsigned int)-1;
-
-		/* Check for overlaps: we shouldn't do this. */
-		for (i = locks; i; i = i->next) {
-			unsigned int i_end = i->off + i->len;
-			if (i->len == 0)
-				i_end = (unsigned int)-1;
-
-			if (fl->l_start >= i->off && fl->l_start < i_end)
-				break;
-			if (fl_end > i->off && fl_end < i_end)
-				break;
-
-			/* tdb_allrecord_lock does this, handle adjacent: */
-			if (fl->l_start > TDB_HASH_LOCK_START
-			    && fl->l_start == i_end && fl->l_type == i->type) {
-				if (ret == 0) {
-					i->len = fl->l_len
-						? i->len + fl->l_len
-						: 0;
-				}
-				goto done;
-			}
-		}
-		if (i) {
-			/* Special case: upgrade of allrecord lock. */
-			if (i->type == F_RDLCK && fl->l_type == F_WRLCK
-			    && i->off == TDB_HASH_LOCK_START
-			    && fl->l_start == TDB_HASH_LOCK_START
-			    && i->len == 0
-			    && fl->l_len == 0) {
-				if (ret == 0)
-					i->type = F_WRLCK;
-				goto done;
-			}
-			if (!suppress_lockcheck) {
-				diag("%s lock %u@%u overlaps %u@%u",
-				     fl->l_type == F_WRLCK ? "write" : "read",
-				     (int)fl->l_len, (int)fl->l_start,
-				     i->len, (int)i->off);
-				locking_errors++;
-			}
-		}
-
-		if (ret == 0) {
-			new = malloc(sizeof *new);
-			new->off = fl->l_start;
-			new->len = fl->l_len;
-			new->type = fl->l_type;
-			new->next = locks;
-			locks = new;
-		}
-	}
-done:
-	if (ret == 0 && fl->l_type == F_UNLCK && unlock_callback)
-		unlock_callback(fd);
-	return ret;
-}
-
-unsigned int forget_locking(void)
-{
-	unsigned int num = 0;
-	while (locks) {
-		struct lock *next = locks->next;
-		free(locks);
-		locks = next;
-		num++;
-	}
-	return num;
-}
diff --git a/lib/tdb2/test/lock-tracking.h b/lib/tdb2/test/lock-tracking.h
deleted file mode 100644
index f2c9c44653..0000000000
--- a/lib/tdb2/test/lock-tracking.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef LOCK_TRACKING_H
-#define LOCK_TRACKING_H
-#include <stdbool.h>
-
-/* Set this if you want a callback after fnctl unlock. */
-extern void (*unlock_callback)(int fd);
-
-/* Replacement fcntl. */
-int fcntl_with_lockcheck(int fd, int cmd, ... /* arg */ );
-
-/* Discard locking info: returns number of locks outstanding. */
-unsigned int forget_locking(void);
-
-/* Number of errors in locking. */
-extern int locking_errors;
-
-/* Suppress lock checking. */
-extern bool suppress_lockcheck;
-
-/* Make all locks non-blocking. */
-extern bool nonblocking_locks;
-
-/* Number of times we failed a lock because we made it non-blocking. */
-extern int locking_would_block;
-#endif /* LOCK_TRACKING_H */
diff --git a/lib/tdb2/test/logging.c b/lib/tdb2/test/logging.c
deleted file mode 100644
index 86fc152bab..0000000000
--- a/lib/tdb2/test/logging.c
+++ /dev/null
@@ -1,30 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include "tap-interface.h"
-#include "logging.h"
-
-unsigned tap_log_messages;
-const char *log_prefix = "";
-char *log_last = NULL;
-bool suppress_logging;
-
-union tdb_attribute tap_log_attr = {
-	.log = { .base = { .attr = TDB_ATTRIBUTE_LOG },
-		 .fn = tap_log_fn }
-};
-
-void tap_log_fn(struct tdb_context *tdb,
-		enum tdb_log_level level,
-		enum TDB_ERROR ecode,
-		const char *message, void *priv)
-{
-	if (suppress_logging)
-		return;
-
-	diag("tdb log level %u: %s: %s%s",
-	     level, tdb_errorstr(ecode), log_prefix, message);
-	if (log_last)
-		free(log_last);
-	log_last = strdup(message);
-	tap_log_messages++;
-}
diff --git a/lib/tdb2/test/logging.h b/lib/tdb2/test/logging.h
deleted file mode 100644
index 5f517dc592..0000000000
--- a/lib/tdb2/test/logging.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef TDB2_TEST_LOGGING_H
-#define TDB2_TEST_LOGGING_H
-#include "tdb2.h"
-#include <stdbool.h>
-#include <string.h>
-
-extern bool suppress_logging;
-extern const char *log_prefix;
-extern unsigned tap_log_messages;
-extern union tdb_attribute tap_log_attr;
-extern char *log_last;
-
-void tap_log_fn(struct tdb_context *tdb,
-		enum tdb_log_level level,
-		enum TDB_ERROR ecode,
-		const char *message, void *priv);
-#endif /* TDB2_TEST_LOGGING_H */
diff --git a/lib/tdb2/test/run-001-encode.c b/lib/tdb2/test/run-001-encode.c
deleted file mode 100644
index 9657eb79d0..0000000000
--- a/lib/tdb2/test/run-001-encode.c
+++ /dev/null
@@ -1,41 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	struct tdb_used_record rec;
-	struct tdb_context tdb = { .log_fn = tap_log_fn };
-
-	plan_tests(64 + 32 + 48*6 + 1);
-
-	/* We should be able to encode any data value. */
-	for (i = 0; i < 64; i++)
-		ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, 0, 1ULL << i,
-			       1ULL << i, 0) == 0);
-
-	/* And any key and data with < 64 bits between them. */
-	for (i = 0; i < 32; i++) {
-		tdb_len_t dlen = 1ULL >> (63 - i), klen = 1ULL << i;
-		ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, klen, dlen,
-			       klen + dlen, 0)  == 0);
-	}
-
-	/* We should neatly encode all values. */
-	for (i = 0; i < 48; i++) {
-		uint64_t h = 1ULL << (i < 5 ? i : 4);
-		uint64_t klen = 1ULL << (i < 16 ? i : 15);
-		uint64_t dlen = 1ULL << i;
-		uint64_t xlen = 1ULL << (i < 32 ? i : 31);
-		ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, klen, dlen,
-			       klen+dlen+xlen, h) == 0);
-		ok1(rec_key_length(&rec) == klen);
-		ok1(rec_data_length(&rec) == dlen);
-		ok1(rec_extra_padding(&rec) == xlen);
-		ok1((uint64_t)rec_hash(&rec) == h);
-		ok1(rec_magic(&rec) == TDB_USED_MAGIC);
-	}
-	ok1(tap_log_messages == 0);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/run-001-fls.c b/lib/tdb2/test/run-001-fls.c
deleted file mode 100644
index 792adbf655..0000000000
--- a/lib/tdb2/test/run-001-fls.c
+++ /dev/null
@@ -1,33 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-
-static unsigned int dumb_fls(uint64_t num)
-{
-	int i;
-
-	for (i = 63; i >= 0; i--) {
-		if (num & (1ULL << i))
-			break;
-	}
-	return i + 1;
-}
-
-int main(int argc, char *argv[])
-{
-	unsigned int i, j;
-
-	plan_tests(64 * 64 + 2);
-
-	ok1(fls64(0) == 0);
-	ok1(dumb_fls(0) == 0);
-
-	for (i = 0; i < 64; i++) {
-		for (j = 0; j < 64; j++) {
-			uint64_t val = (1ULL << i) | (1ULL << j);
-			ok(fls64(val) == dumb_fls(val),
-			   "%llu -> %u should be %u", (long long)val,
-			   fls64(val), dumb_fls(val));
-		}
-	}
-	return exit_status();
-}
diff --git a/lib/tdb2/test/run-01-new_database.c b/lib/tdb2/test/run-01-new_database.c
deleted file mode 100644
index 00c15140df..0000000000
--- a/lib/tdb2/test/run-01-new_database.c
+++ /dev/null
@@ -1,34 +0,0 @@
-#include <ccan/failtest/failtest_override.h>
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include <ccan/failtest/failtest.h>
-#include "logging.h"
-#include "failtest_helper.h"
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	struct tdb_context *tdb;
-	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
-			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
-			TDB_NOMMAP|TDB_CONVERT };
-
-	failtest_init(argc, argv);
-	failtest_hook = block_repeat_failures;
-	failtest_exit_check = exit_check_log;
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 3);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		tdb = tdb_open("run-new_database.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		if (!ok1(tdb))
-			failtest_exit(exit_status());
-
-		failtest_suppress = true;
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-		failtest_suppress = false;
-		tdb_close(tdb);
-		if (!ok1(tap_log_messages == 0))
-			break;
-	}
-	failtest_exit(exit_status());
-}
diff --git a/lib/tdb2/test/run-02-expand.c b/lib/tdb2/test/run-02-expand.c
deleted file mode 100644
index fd1ae4be34..0000000000
--- a/lib/tdb2/test/run-02-expand.c
+++ /dev/null
@@ -1,62 +0,0 @@
-#include <ccan/failtest/failtest_override.h>
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include <ccan/failtest/failtest.h>
-#include "logging.h"
-#include "failtest_helper.h"
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	uint64_t val;
-	struct tdb_context *tdb;
-	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
-			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
-			TDB_NOMMAP|TDB_CONVERT };
-
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 11 + 1);
-
-	failtest_init(argc, argv);
-	failtest_hook = block_repeat_failures;
-	failtest_exit_check = exit_check_log;
-
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		failtest_suppress = true;
-		tdb = tdb_open("run-expand.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		if (!ok1(tdb))
-			break;
-
-		val = tdb->file->map_size;
-		/* Need some hash lock for expand. */
-		ok1(tdb_lock_hashes(tdb, 0, 1, F_WRLCK, TDB_LOCK_WAIT) == 0);
-		failtest_suppress = false;
-		if (!ok1(tdb_expand(tdb, 1) == 0)) {
-			failtest_suppress = true;
-			tdb_close(tdb);
-			break;
-		}
-		failtest_suppress = true;
-
-		ok1(tdb->file->map_size >= val + 1 * TDB_EXTENSION_FACTOR);
-		ok1(tdb_unlock_hashes(tdb, 0, 1, F_WRLCK) == 0);
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-		val = tdb->file->map_size;
-		ok1(tdb_lock_hashes(tdb, 0, 1, F_WRLCK, TDB_LOCK_WAIT) == 0);
-		failtest_suppress = false;
-		if (!ok1(tdb_expand(tdb, 1024) == 0)) {
-			failtest_suppress = true;
-			tdb_close(tdb);
-			break;
-		}
-		failtest_suppress = true;
-		ok1(tdb_unlock_hashes(tdb, 0, 1, F_WRLCK) == 0);
-		ok1(tdb->file->map_size >= val + 1024 * TDB_EXTENSION_FACTOR);
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-		tdb_close(tdb);
-	}
-
-	ok1(tap_log_messages == 0);
-	failtest_exit(exit_status());
-}
diff --git a/lib/tdb2/test/run-03-coalesce.c b/lib/tdb2/test/run-03-coalesce.c
deleted file mode 100644
index ecc469fa32..0000000000
--- a/lib/tdb2/test/run-03-coalesce.c
+++ /dev/null
@@ -1,178 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-#include "layout.h"
-
-static tdb_len_t free_record_length(struct tdb_context *tdb, tdb_off_t off)
-{
-	struct tdb_free_record f;
-	enum TDB_ERROR ecode;
-
-	ecode = tdb_read_convert(tdb, off, &f, sizeof(f));
-	if (ecode != TDB_SUCCESS)
-		return ecode;
-	if (frec_magic(&f) != TDB_FREE_MAGIC)
-		return TDB_ERR_CORRUPT;
-	return frec_len(&f);
-}
-
-int main(int argc, char *argv[])
-{
-	tdb_off_t b_off, test;
-	struct tdb_context *tdb;
-	struct tdb_layout *layout;
-	struct tdb_data data, key;
-	tdb_len_t len;
-
-	/* FIXME: Test TDB_CONVERT */
-	/* FIXME: Test lock order fail. */
-
-	plan_tests(42);
-	data = tdb_mkdata("world", 5);
-	key = tdb_mkdata("hello", 5);
-
-	/* No coalescing can be done due to EOF */
-	layout = new_tdb_layout();
-	tdb_layout_add_freetable(layout);
-	len = 1024;
-	tdb_layout_add_free(layout, len, 0);
-	tdb_layout_write(layout, free, &tap_log_attr, "run-03-coalesce.tdb");
-	/* NOMMAP is for lockcheck. */
-	tdb = tdb_open("run-03-coalesce.tdb", TDB_NOMMAP, O_RDWR, 0,
-		       &tap_log_attr);
-	ok1(tdb_check(tdb, NULL, NULL) == 0);
-	ok1(free_record_length(tdb, layout->elem[1].base.off) == len);
-
-	/* Figure out which bucket free entry is. */
-	b_off = bucket_off(tdb->ftable_off, size_to_bucket(len));
-	/* Lock and fail to coalesce. */
-	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
-	test = layout->elem[1].base.off;
-	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, len, &test)
-	    == 0);
-	tdb_unlock_free_bucket(tdb, b_off);
-	ok1(free_record_length(tdb, layout->elem[1].base.off) == len);
-	ok1(test == layout->elem[1].base.off);
-	ok1(tdb_check(tdb, NULL, NULL) == 0);
-	tdb_close(tdb);
-	tdb_layout_free(layout);
-
-	/* No coalescing can be done due to used record */
-	layout = new_tdb_layout();
-	tdb_layout_add_freetable(layout);
-	tdb_layout_add_free(layout, 1024, 0);
-	tdb_layout_add_used(layout, key, data, 6);
-	tdb_layout_write(layout, free, &tap_log_attr, "run-03-coalesce.tdb");
-	/* NOMMAP is for lockcheck. */
-	tdb = tdb_open("run-03-coalesce.tdb", TDB_NOMMAP, O_RDWR, 0,
-		       &tap_log_attr);
-	ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024);
-	ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-	/* Figure out which bucket free entry is. */
-	b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
-	/* Lock and fail to coalesce. */
-	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
-	test = layout->elem[1].base.off;
-	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test)
-	    == 0);
-	tdb_unlock_free_bucket(tdb, b_off);
-	ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024);
-	ok1(test == layout->elem[1].base.off);
-	ok1(tdb_check(tdb, NULL, NULL) == 0);
-	tdb_close(tdb);
-	tdb_layout_free(layout);
-
-	/* Coalescing can be done due to two free records, then EOF */
-	layout = new_tdb_layout();
-	tdb_layout_add_freetable(layout);
-	tdb_layout_add_free(layout, 1024, 0);
-	tdb_layout_add_free(layout, 2048, 0);
-	tdb_layout_write(layout, free, &tap_log_attr, "run-03-coalesce.tdb");
-	/* NOMMAP is for lockcheck. */
-	tdb = tdb_open("run-03-coalesce.tdb", TDB_NOMMAP, O_RDWR, 0,
-		       &tap_log_attr);
-	ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024);
-	ok1(free_record_length(tdb, layout->elem[2].base.off) == 2048);
-	ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-	/* Figure out which bucket (first) free entry is. */
-	b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
-	/* Lock and coalesce. */
-	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
-	test = layout->elem[2].base.off;
-	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test)
-	    == 1024 + sizeof(struct tdb_used_record) + 2048);
-	/* Should tell us it's erased this one... */
-	ok1(test == TDB_ERR_NOEXIST);
-	ok1(tdb->file->allrecord_lock.count == 0 && tdb->file->num_lockrecs == 0);
-	ok1(free_record_length(tdb, layout->elem[1].base.off)
-	    == 1024 + sizeof(struct tdb_used_record) + 2048);
-	ok1(tdb_check(tdb, NULL, NULL) == 0);
-	tdb_close(tdb);
-	tdb_layout_free(layout);
-
-	/* Coalescing can be done due to two free records, then data */
-	layout = new_tdb_layout();
-	tdb_layout_add_freetable(layout);
-	tdb_layout_add_free(layout, 1024, 0);
-	tdb_layout_add_free(layout, 512, 0);
-	tdb_layout_add_used(layout, key, data, 6);
-	tdb_layout_write(layout, free, &tap_log_attr, "run-03-coalesce.tdb");
-	/* NOMMAP is for lockcheck. */
-	tdb = tdb_open("run-03-coalesce.tdb", TDB_NOMMAP, O_RDWR, 0,
-		       &tap_log_attr);
-	ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024);
-	ok1(free_record_length(tdb, layout->elem[2].base.off) == 512);
-	ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-	/* Figure out which bucket free entry is. */
-	b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
-	/* Lock and coalesce. */
-	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
-	test = layout->elem[2].base.off;
-	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test)
-	    == 1024 + sizeof(struct tdb_used_record) + 512);
-	ok1(tdb->file->allrecord_lock.count == 0 && tdb->file->num_lockrecs == 0);
-	ok1(free_record_length(tdb, layout->elem[1].base.off)
-	    == 1024 + sizeof(struct tdb_used_record) + 512);
-	ok1(test == TDB_ERR_NOEXIST);
-	ok1(tdb_check(tdb, NULL, NULL) == 0);
-	tdb_close(tdb);
-	tdb_layout_free(layout);
-
-	/* Coalescing can be done due to three free records, then EOF */
-	layout = new_tdb_layout();
-	tdb_layout_add_freetable(layout);
-	tdb_layout_add_free(layout, 1024, 0);
-	tdb_layout_add_free(layout, 512, 0);
-	tdb_layout_add_free(layout, 256, 0);
-	tdb_layout_write(layout, free, &tap_log_attr, "run-03-coalesce.tdb");
-	/* NOMMAP is for lockcheck. */
-	tdb = tdb_open("run-03-coalesce.tdb", TDB_NOMMAP, O_RDWR, 0,
-		       &tap_log_attr);
-	ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024);
-	ok1(free_record_length(tdb, layout->elem[2].base.off) == 512);
-	ok1(free_record_length(tdb, layout->elem[3].base.off) == 256);
-	ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-	/* Figure out which bucket free entry is. */
-	b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
-	/* Lock and coalesce. */
-	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
-	test = layout->elem[2].base.off;
-	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test)
-	    == 1024 + sizeof(struct tdb_used_record) + 512
-	    + sizeof(struct tdb_used_record) + 256);
-	ok1(tdb->file->allrecord_lock.count == 0
-	    && tdb->file->num_lockrecs == 0);
-	ok1(free_record_length(tdb, layout->elem[1].base.off)
-	    == 1024 + sizeof(struct tdb_used_record) + 512
-	    + sizeof(struct tdb_used_record) + 256);
-	ok1(tdb_check(tdb, NULL, NULL) == 0);
-	tdb_close(tdb);
-	tdb_layout_free(layout);
-
-	ok1(tap_log_messages == 0);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/run-04-basichash.c b/lib/tdb2/test/run-04-basichash.c
deleted file mode 100644
index dc75fc72dc..0000000000
--- a/lib/tdb2/test/run-04-basichash.c
+++ /dev/null
@@ -1,260 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-
-/* We rig the hash so adjacent-numbered records always clash. */
-static uint64_t clash(const void *key, size_t len, uint64_t seed, void *priv)
-{
-	return ((uint64_t)*(const unsigned int *)key)
-		<< (64 - TDB_TOPLEVEL_HASH_BITS - 1);
-}
-
-int main(int argc, char *argv[])
-{
-	unsigned int i, j;
-	struct tdb_context *tdb;
-	unsigned int v;
-	struct tdb_used_record rec;
-	struct tdb_data key = { (unsigned char *)&v, sizeof(v) };
-	struct tdb_data dbuf = { (unsigned char *)&v, sizeof(v) };
-	union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
-						.fn = clash } };
-	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
-			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
-			TDB_NOMMAP|TDB_CONVERT,
-	};
-
-	hattr.base.next = &tap_log_attr;
-
-	plan_tests(sizeof(flags) / sizeof(flags[0])
-		   * (91 + (2 * ((1 << TDB_HASH_GROUP_BITS) - 1))) + 1);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		struct hash_info h;
-		tdb_off_t new_off, off, subhash;
-
-		tdb = tdb_open("run-04-basichash.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
-		ok1(tdb);
-		if (!tdb)
-			continue;
-
-		v = 0;
-		/* Should not find it. */
-		ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) == 0);
-		/* Should have created correct hash. */
-		ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
-		/* Should have located space in group 0, bucket 0. */
-		ok1(h.group_start == offsetof(struct tdb_header, hashtable));
-		ok1(h.home_bucket == 0);
-		ok1(h.found_bucket == 0);
-		ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS);
-
-		/* Should have lock on bucket 0 */
-		ok1(h.hlock_start == 0);
-		ok1(h.hlock_range ==
-		    1ULL << (64-(TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS)));
-		ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1);
-		ok1((tdb->flags & TDB_NOLOCK)
-		    || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START);
-		/* FIXME: Check lock length */
-
-		/* Allocate a new record. */
-		new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h,
-				TDB_USED_MAGIC, false);
-		ok1(!TDB_OFF_IS_ERR(new_off));
-
-		/* We should be able to add it now. */
-		ok1(add_to_hash(tdb, &h, new_off) == 0);
-
-		/* Make sure we fill it in for later finding. */
-		off = new_off + sizeof(struct tdb_used_record);
-		ok1(!tdb->io->twrite(tdb, off, key.dptr, key.dsize));
-		off += key.dsize;
-		ok1(!tdb->io->twrite(tdb, off, dbuf.dptr, dbuf.dsize));
-
-		/* We should be able to unlock that OK. */
-		ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
-				      F_WRLCK) == 0);
-
-		/* Database should be consistent. */
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-		/* Now, this should give a successful lookup. */
-		ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL)
-		    == new_off);
-		/* Should have created correct hash. */
-		ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
-		/* Should have located space in group 0, bucket 0. */
-		ok1(h.group_start == offsetof(struct tdb_header, hashtable));
-		ok1(h.home_bucket == 0);
-		ok1(h.found_bucket == 0);
-		ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS);
-
-		/* Should have lock on bucket 0 */
-		ok1(h.hlock_start == 0);
-		ok1(h.hlock_range ==
-		    1ULL << (64-(TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS)));
-		ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1);
-		ok1((tdb->flags & TDB_NOLOCK)
-		    || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START);
-		/* FIXME: Check lock length */
-
-		ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
-				      F_WRLCK) == 0);
-
-		/* Database should be consistent. */
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-		/* Test expansion. */
-		v = 1;
-		ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) == 0);
-		/* Should have created correct hash. */
-		ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
-		/* Should have located space in group 0, bucket 1. */
-		ok1(h.group_start == offsetof(struct tdb_header, hashtable));
-		ok1(h.home_bucket == 0);
-		ok1(h.found_bucket == 1);
-		ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS);
-
-		/* Should have lock on bucket 0 */
-		ok1(h.hlock_start == 0);
-		ok1(h.hlock_range ==
-		    1ULL << (64-(TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS)));
-		ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1);
-		ok1((tdb->flags & TDB_NOLOCK)
-		    || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START);
-		/* FIXME: Check lock length */
-
-		/* Make it expand 0'th bucket. */
-		ok1(expand_group(tdb, &h) == 0);
-		/* First one should be subhash, next should be empty. */
-		ok1(is_subhash(h.group[0]));
-		subhash = (h.group[0] & TDB_OFF_MASK);
-		for (j = 1; j < (1 << TDB_HASH_GROUP_BITS); j++)
-			ok1(h.group[j] == 0);
-
-		ok1(tdb_write_convert(tdb, h.group_start,
-				      h.group, sizeof(h.group)) == 0);
-		ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
-				      F_WRLCK) == 0);
-
-		/* Should be happy with expansion. */
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-		/* Should be able to find it. */
-		v = 0;
-		ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL)
-		    == new_off);
-		/* Should have created correct hash. */
-		ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
-		/* Should have located space in expanded group 0, bucket 0. */
-		ok1(h.group_start == subhash + sizeof(struct tdb_used_record));
-		ok1(h.home_bucket == 0);
-		ok1(h.found_bucket == 0);
-		ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS
-		    + TDB_SUBLEVEL_HASH_BITS);
-
-		/* Should have lock on bucket 0 */
-		ok1(h.hlock_start == 0);
-		ok1(h.hlock_range ==
-		    1ULL << (64-(TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS)));
-		ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1);
-		ok1((tdb->flags & TDB_NOLOCK)
-		    || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START);
-		/* FIXME: Check lock length */
-
-		/* Simple delete should work. */
-		ok1(delete_from_hash(tdb, &h) == 0);
-		ok1(add_free_record(tdb, new_off,
-				    sizeof(struct tdb_used_record)
-				    + rec_key_length(&rec)
-				    + rec_data_length(&rec)
-				    + rec_extra_padding(&rec),
-				    TDB_LOCK_NOWAIT, false) == 0);
-		ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
-				      F_WRLCK) == 0);
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-		/* Test second-level expansion: should expand 0th bucket. */
-		v = 0;
-		ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) == 0);
-		/* Should have created correct hash. */
-		ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
-		/* Should have located space in group 0, bucket 0. */
-		ok1(h.group_start == subhash + sizeof(struct tdb_used_record));
-		ok1(h.home_bucket == 0);
-		ok1(h.found_bucket == 0);
-		ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS+TDB_SUBLEVEL_HASH_BITS);
-
-		/* Should have lock on bucket 0 */
-		ok1(h.hlock_start == 0);
-		ok1(h.hlock_range ==
-		    1ULL << (64-(TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS)));
-		ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1);
-		ok1((tdb->flags & TDB_NOLOCK)
-		    || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START);
-		/* FIXME: Check lock length */
-
-		ok1(expand_group(tdb, &h) == 0);
-		/* First one should be subhash, next should be empty. */
-		ok1(is_subhash(h.group[0]));
-		subhash = (h.group[0] & TDB_OFF_MASK);
-		for (j = 1; j < (1 << TDB_HASH_GROUP_BITS); j++)
-			ok1(h.group[j] == 0);
-		ok1(tdb_write_convert(tdb, h.group_start,
-				      h.group, sizeof(h.group)) == 0);
-		ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
-				      F_WRLCK) == 0);
-
-		/* Should be happy with expansion. */
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-		ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) == 0);
-		/* Should have created correct hash. */
-		ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
-		/* Should have located space in group 0, bucket 0. */
-		ok1(h.group_start == subhash + sizeof(struct tdb_used_record));
-		ok1(h.home_bucket == 0);
-		ok1(h.found_bucket == 0);
-		ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS
-		    + TDB_SUBLEVEL_HASH_BITS * 2);
-
-		/* We should be able to add it now. */
-		/* Allocate a new record. */
-		new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h,
-				TDB_USED_MAGIC, false);
-		ok1(!TDB_OFF_IS_ERR(new_off));
-		ok1(add_to_hash(tdb, &h, new_off) == 0);
-
-		/* Make sure we fill it in for later finding. */
-		off = new_off + sizeof(struct tdb_used_record);
-		ok1(!tdb->io->twrite(tdb, off, key.dptr, key.dsize));
-		off += key.dsize;
-		ok1(!tdb->io->twrite(tdb, off, dbuf.dptr, dbuf.dsize));
-
-		/* We should be able to unlock that OK. */
-		ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
-				      F_WRLCK) == 0);
-
-		/* Database should be consistent. */
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-		/* Should be able to find it. */
-		v = 0;
-		ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL)
-		    == new_off);
-		/* Should have created correct hash. */
-		ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
-		/* Should have located space in expanded group 0, bucket 0. */
-		ok1(h.group_start == subhash + sizeof(struct tdb_used_record));
-		ok1(h.home_bucket == 0);
-		ok1(h.found_bucket == 0);
-		ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS
-		    + TDB_SUBLEVEL_HASH_BITS * 2);
-
-		tdb_close(tdb);
-	}
-
-	ok1(tap_log_messages == 0);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/run-05-readonly-open.c b/lib/tdb2/test/run-05-readonly-open.c
deleted file mode 100644
index 1046a8b47e..0000000000
--- a/lib/tdb2/test/run-05-readonly-open.c
+++ /dev/null
@@ -1,71 +0,0 @@
-#include <ccan/failtest/failtest_override.h>
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include <ccan/failtest/failtest.h>
-#include "logging.h"
-#include "failtest_helper.h"
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	struct tdb_context *tdb;
-	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
-			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
-	struct tdb_data key = tdb_mkdata("key", 3);
-	struct tdb_data data = tdb_mkdata("data", 4), d;
-	union tdb_attribute seed_attr;
-	unsigned int msgs = 0;
-
-	failtest_init(argc, argv);
-	failtest_hook = block_repeat_failures;
-	failtest_exit_check = exit_check_log;
-
-	seed_attr.base.attr = TDB_ATTRIBUTE_SEED;
-	seed_attr.base.next = &tap_log_attr;
-	seed_attr.seed.seed = 0;
-
-	failtest_suppress = true;
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 11);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		tdb = tdb_open("run-05-readonly-open.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600,
-			       &seed_attr);
-		ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
-		tdb_close(tdb);
-
-		failtest_suppress = false;
-		tdb = tdb_open("run-05-readonly-open.tdb", flags[i],
-			       O_RDONLY, 0600, &tap_log_attr);
-		if (!ok1(tdb))
-			break;
-		ok1(tap_log_messages == msgs);
-		/* Fetch should succeed, stores should fail. */
-		if (!ok1(tdb_fetch(tdb, key, &d) == 0))
-			goto fail;
-		ok1(tdb_deq(d, data));
-		free(d.dptr);
-		if (!ok1(tdb_store(tdb, key, data, TDB_MODIFY)
-			 == TDB_ERR_RDONLY))
-			goto fail;
-		ok1(tap_log_messages == ++msgs);
-		if (!ok1(tdb_store(tdb, key, data, TDB_INSERT)
-			 == TDB_ERR_RDONLY))
-			goto fail;
-		ok1(tap_log_messages == ++msgs);
-		failtest_suppress = true;
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-		tdb_close(tdb);
-		ok1(tap_log_messages == msgs);
-		/* SIGH: failtest bug, it doesn't save the tdb file because
-		 * we have it read-only.  If we go around again, it gets
-		 * changed underneath us and things get screwy. */
-		if (failtest_has_failed())
-			break;
-	}
-	failtest_exit(exit_status());
-
-fail:
-	failtest_suppress = true;
-	tdb_close(tdb);
-	failtest_exit(exit_status());
-}
diff --git a/lib/tdb2/test/run-10-simple-store.c b/lib/tdb2/test/run-10-simple-store.c
deleted file mode 100644
index 66bf6a6a51..0000000000
--- a/lib/tdb2/test/run-10-simple-store.c
+++ /dev/null
@@ -1,58 +0,0 @@
-#include <ccan/failtest/failtest_override.h>
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include <ccan/failtest/failtest.h>
-#include "logging.h"
-#include "failtest_helper.h"
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	struct tdb_context *tdb;
-	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
-			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
-			TDB_NOMMAP|TDB_CONVERT };
-	struct tdb_data key = tdb_mkdata("key", 3);
-	struct tdb_data data = tdb_mkdata("data", 4);
-
-	failtest_init(argc, argv);
-	failtest_hook = block_repeat_failures;
-	failtest_exit_check = exit_check_log;
-
-	failtest_suppress = true;
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 7 + 1);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		tdb = tdb_open("run-10-simple-store.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		if (!ok1(tdb))
-			break;
-		/* Modify should fail. */
-		failtest_suppress = false;
-		if (!ok1(tdb_store(tdb, key, data, TDB_MODIFY)
-			 == TDB_ERR_NOEXIST))
-			goto fail;
-		failtest_suppress = true;
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-		/* Insert should succeed. */
-		failtest_suppress = false;
-		if (!ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0))
-			goto fail;
-		failtest_suppress = true;
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-		/* Second insert should fail. */
-		failtest_suppress = false;
-		if (!ok1(tdb_store(tdb, key, data, TDB_INSERT)
-			 == TDB_ERR_EXISTS))
-			goto fail;
-		failtest_suppress = true;
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-		tdb_close(tdb);
-	}
-	ok1(tap_log_messages == 0);
-	failtest_exit(exit_status());
-
-fail:
-	failtest_suppress = true;
-	tdb_close(tdb);
-	failtest_exit(exit_status());
-}
diff --git a/lib/tdb2/test/run-11-simple-fetch.c b/lib/tdb2/test/run-11-simple-fetch.c
deleted file mode 100644
index 4c41ceec6d..0000000000
--- a/lib/tdb2/test/run-11-simple-fetch.c
+++ /dev/null
@@ -1,58 +0,0 @@
-#include <ccan/failtest/failtest_override.h>
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include <ccan/failtest/failtest.h>
-#include "logging.h"
-#include "failtest_helper.h"
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	struct tdb_context *tdb;
-	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
-			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
-			TDB_NOMMAP|TDB_CONVERT };
-	struct tdb_data key = tdb_mkdata("key", 3);
-	struct tdb_data data = tdb_mkdata("data", 4);
-
-	failtest_init(argc, argv);
-	failtest_hook = block_repeat_failures;
-	failtest_exit_check = exit_check_log;
-
-	failtest_suppress = true;
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 8 + 1);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		tdb = tdb_open("run-11-simple-fetch.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		ok1(tdb);
-		if (tdb) {
-			struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */
-
-			/* fetch should fail. */
-			failtest_suppress = false;
-			if (!ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_NOEXIST))
-				goto fail;
-			failtest_suppress = true;
-			ok1(tdb_check(tdb, NULL, NULL) == 0);
-			/* Insert should succeed. */
-			ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
-			ok1(tdb_check(tdb, NULL, NULL) == 0);
-			/* Fetch should now work. */
-			failtest_suppress = false;
-			if (!ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS))
-				goto fail;
-			failtest_suppress = true;
-			ok1(tdb_deq(d, data));
-			free(d.dptr);
-			ok1(tdb_check(tdb, NULL, NULL) == 0);
-			tdb_close(tdb);
-		}
-	}
-	ok1(tap_log_messages == 0);
-	failtest_exit(exit_status());
-
-fail:
-	failtest_suppress = true;
-	tdb_close(tdb);
-	failtest_exit(exit_status());
-}
diff --git a/lib/tdb2/test/run-12-check.c b/lib/tdb2/test/run-12-check.c
deleted file mode 100644
index cc57726f93..0000000000
--- a/lib/tdb2/test/run-12-check.c
+++ /dev/null
@@ -1,46 +0,0 @@
-#include "private.h"
-#include <ccan/failtest/failtest_override.h>
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include <ccan/failtest/failtest.h>
-#include "logging.h"
-#include "failtest_helper.h"
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	struct tdb_context *tdb;
-	int flags[] = { TDB_INTERNAL,
-			TDB_INTERNAL|TDB_CONVERT,
-			TDB_CONVERT };
-	struct tdb_data key = tdb_mkdata("key", 3);
-	struct tdb_data data = tdb_mkdata("data", 4);
-
-	failtest_init(argc, argv);
-	failtest_hook = block_repeat_failures;
-	failtest_exit_check = exit_check_log;
-
-	failtest_suppress = true;
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 3 + 1);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		tdb = tdb_open("run-12-check.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		ok1(tdb);
-		ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
-
-		/* This is what we really want to test: tdb_check(). */
-		failtest_suppress = false;
-		if (!ok1(tdb_check(tdb, NULL, NULL) == 0))
-			goto fail;
-		failtest_suppress = true;
-
-		tdb_close(tdb);
-	}
-	ok1(tap_log_messages == 0);
-	failtest_exit(exit_status());
-
-fail:
-	failtest_suppress = true;
-	tdb_close(tdb);
-	failtest_exit(exit_status());
-}
diff --git a/lib/tdb2/test/run-15-append.c b/lib/tdb2/test/run-15-append.c
deleted file mode 100644
index 6578b70734..0000000000
--- a/lib/tdb2/test/run-15-append.c
+++ /dev/null
@@ -1,130 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include <ccan/ilog/ilog.h>
-#include "logging.h"
-
-#define MAX_SIZE 13100
-#define SIZE_STEP 131
-
-static tdb_off_t tdb_offset(struct tdb_context *tdb, struct tdb_data key)
-{
-	tdb_off_t off;
-	struct tdb_used_record urec;
-	struct hash_info h;
-
-	off = find_and_lock(tdb, key, F_RDLCK, &h, &urec, NULL);
-	if (TDB_OFF_IS_ERR(off))
-		return 0;
-	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK);
-	return off;
-}
-
-int main(int argc, char *argv[])
-{
-	unsigned int i, j, moves;
-	struct tdb_context *tdb;
-	unsigned char *buffer;
-	tdb_off_t oldoff = 0, newoff;
-	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
-			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
-			TDB_NOMMAP|TDB_CONVERT };
-	struct tdb_data key = tdb_mkdata("key", 3);
-	struct tdb_data data;
-
-	buffer = malloc(MAX_SIZE);
-	for (i = 0; i < MAX_SIZE; i++)
-		buffer[i] = i;
-
-	plan_tests(sizeof(flags) / sizeof(flags[0])
-		   * ((3 + MAX_SIZE/SIZE_STEP * 5) * 2 + 7)
-		   + 1);
-
-	/* Using tdb_store. */
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		tdb = tdb_open("run-append.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		ok1(tdb);
-		if (!tdb)
-			continue;
-
-		moves = 0;
-		for (j = 0; j < MAX_SIZE; j += SIZE_STEP) {
-			data.dptr = buffer;
-			data.dsize = j;
-			ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0);
-			ok1(tdb_check(tdb, NULL, NULL) == 0);
-			ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS);
-			ok1(data.dsize == j);
-			ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
-			free(data.dptr);
-			newoff = tdb_offset(tdb, key);
-			if (newoff != oldoff)
-				moves++;
-			oldoff = newoff;
-		}
-		ok1(!tdb->file || (tdb->file->allrecord_lock.count == 0
-				   && tdb->file->num_lockrecs == 0));
-		/* We should increase by 50% each time... */
-		ok(moves <= ilog64(j / SIZE_STEP)*2,
-		   "Moved %u times", moves);
-		tdb_close(tdb);
-	}
-
-	/* Using tdb_append. */
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		size_t prev_len = 0;
-		tdb = tdb_open("run-append.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		ok1(tdb);
-		if (!tdb)
-			continue;
-
-		moves = 0;
-		for (j = 0; j < MAX_SIZE; j += SIZE_STEP) {
-			data.dptr = buffer + prev_len;
-			data.dsize = j - prev_len;
-			ok1(tdb_append(tdb, key, data) == 0);
-			ok1(tdb_check(tdb, NULL, NULL) == 0);
-			ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS);
-			ok1(data.dsize == j);
-			ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
-			free(data.dptr);
-			prev_len = data.dsize;
-			newoff = tdb_offset(tdb, key);
-			if (newoff != oldoff)
-				moves++;
-			oldoff = newoff;
-		}
-		ok1(!tdb->file || (tdb->file->allrecord_lock.count == 0
-				   && tdb->file->num_lockrecs == 0));
-		/* We should increase by 50% each time... */
-		ok(moves <= ilog64(j / SIZE_STEP)*2,
-		   "Moved %u times", moves);
-		tdb_close(tdb);
-	}
-
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		tdb = tdb_open("run-append.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		ok1(tdb);
-		if (!tdb)
-			continue;
-
-		/* Huge initial store. */
-		data.dptr = buffer;
-		data.dsize = MAX_SIZE;
-		ok1(tdb_append(tdb, key, data) == 0);
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-		ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS);
-		ok1(data.dsize == MAX_SIZE);
-		ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
-		free(data.dptr);
-		ok1(!tdb->file || (tdb->file->allrecord_lock.count == 0
-				   && tdb->file->num_lockrecs == 0));
-		tdb_close(tdb);
-	}
-
-	ok1(tap_log_messages == 0);
-	free(buffer);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/run-20-growhash.c b/lib/tdb2/test/run-20-growhash.c
deleted file mode 100644
index 2f634a27c0..0000000000
--- a/lib/tdb2/test/run-20-growhash.c
+++ /dev/null
@@ -1,137 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-
-static uint64_t myhash(const void *key, size_t len, uint64_t seed, void *priv)
-{
-	return *(const uint64_t *)key;
-}
-
-static void add_bits(uint64_t *val, unsigned new, unsigned new_bits,
-		     unsigned *done)
-{
-	*done += new_bits;
-	*val |= ((uint64_t)new << (64 - *done));
-}
-
-static uint64_t make_key(unsigned topgroup, unsigned topbucket,
-			 unsigned subgroup1, unsigned subbucket1,
-			 unsigned subgroup2, unsigned subbucket2)
-{
-	uint64_t key = 0;
-	unsigned done = 0;
-
-	add_bits(&key, topgroup, TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS,
-		 &done);
-	add_bits(&key, topbucket, TDB_HASH_GROUP_BITS, &done);
-	add_bits(&key, subgroup1, TDB_SUBLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS,
-		 &done);
-	add_bits(&key, subbucket1, TDB_HASH_GROUP_BITS, &done);
-	add_bits(&key, subgroup2, TDB_SUBLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS,
-		 &done);
-	add_bits(&key, subbucket2, TDB_HASH_GROUP_BITS, &done);
-	return key;
-}
-
-int main(int argc, char *argv[])
-{
-	unsigned int i, j;
-	struct tdb_context *tdb;
-	uint64_t kdata;
-	struct tdb_used_record rec;
-	struct tdb_data key = { (unsigned char *)&kdata, sizeof(kdata) };
-	struct tdb_data dbuf = { (unsigned char *)&kdata, sizeof(kdata) };
-	union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
-						.fn = myhash } };
-	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
-			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
-			TDB_NOMMAP|TDB_CONVERT,
-	};
-
-	hattr.base.next = &tap_log_attr;
-
-	plan_tests(sizeof(flags) / sizeof(flags[0])
-		   * (9 + (20 + 2 * ((1 << TDB_HASH_GROUP_BITS) - 2))
-		      * (1 << TDB_HASH_GROUP_BITS)) + 1);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		struct hash_info h;
-
-		tdb = tdb_open("run-20-growhash.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
-		ok1(tdb);
-		if (!tdb)
-			continue;
-
-		/* Fill a group. */
-		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) {
-			kdata = make_key(0, j, 0, 0, 0, 0);
-			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
-		}
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-		/* Check first still exists. */
-		kdata = make_key(0, 0, 0, 0, 0, 0);
-		ok1(find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL) != 0);
-		/* Should have created correct hash. */
-		ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
-		/* Should have located space in group 0, bucket 0. */
-		ok1(h.group_start == offsetof(struct tdb_header, hashtable));
-		ok1(h.home_bucket == 0);
-		ok1(h.found_bucket == 0);
-		ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS);
-		/* Entire group should be full! */
-		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++)
-			ok1(h.group[j] != 0);
-
-		ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
-				      F_RDLCK) == 0);
-
-		/* Now, add one more to each should expand (that) bucket. */
-		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) {
-			unsigned int k;
-			kdata = make_key(0, j, 0, 1, 0, 0);
-			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
-			ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-			ok1(find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL));
-			/* Should have created correct hash. */
-			ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
-			/* Should have moved to subhash */
-			ok1(h.group_start >= sizeof(struct tdb_header));
-			ok1(h.home_bucket == 1);
-			ok1(h.found_bucket == 1);
-			ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS
-			    + TDB_SUBLEVEL_HASH_BITS);
-			ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
-					      F_RDLCK) == 0);
-
-			/* Keep adding, make it expand again. */
-			for (k = 2; k < (1 << TDB_HASH_GROUP_BITS); k++) {
-				kdata = make_key(0, j, 0, k, 0, 0);
-				ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
-				ok1(tdb_check(tdb, NULL, NULL) == 0);
-			}
-
-			/* This should tip it over to sub-sub-hash. */
-			kdata = make_key(0, j, 0, 0, 0, 1);
-			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
-			ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-			ok1(find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL));
-			/* Should have created correct hash. */
-			ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
-			/* Should have moved to subhash */
-			ok1(h.group_start >= sizeof(struct tdb_header));
-			ok1(h.home_bucket == 1);
-			ok1(h.found_bucket == 1);
-			ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS
-			    + TDB_SUBLEVEL_HASH_BITS + TDB_SUBLEVEL_HASH_BITS);
-			ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
-					      F_RDLCK) == 0);
-		}
-		tdb_close(tdb);
-	}
-
-	ok1(tap_log_messages == 0);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/run-25-hashoverload.c b/lib/tdb2/test/run-25-hashoverload.c
deleted file mode 100644
index 850321554a..0000000000
--- a/lib/tdb2/test/run-25-hashoverload.c
+++ /dev/null
@@ -1,113 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-
-static uint64_t badhash(const void *key, size_t len, uint64_t seed, void *priv)
-{
-	return 0;
-}
-
-static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *p)
-{
-	if (p)
-		return tdb_delete(tdb, key);
-	return 0;
-}
-
-int main(int argc, char *argv[])
-{
-	unsigned int i, j;
-	struct tdb_context *tdb;
-	struct tdb_data key = { (unsigned char *)&j, sizeof(j) };
-	struct tdb_data dbuf = { (unsigned char *)&j, sizeof(j) };
-	union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
-						.fn = badhash } };
-	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
-			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
-			TDB_NOMMAP|TDB_CONVERT,
-	};
-
-	hattr.base.next = &tap_log_attr;
-
-	plan_tests(6883);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */
-
-		tdb = tdb_open("run-25-hashoverload.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
-		ok1(tdb);
-		if (!tdb)
-			continue;
-
-		/* Fill a group. */
-		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) {
-			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
-		}
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-		/* Now store one last value: should form chain. */
-		ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-		/* Check we can find them all. */
-		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS) + 1; j++) {
-			ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
-			ok1(d.dsize == sizeof(j));
-			ok1(d.dptr != NULL);
-			ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
-			free(d.dptr);
-		}
-
-		/* Now add a *lot* more. */
-		for (j = (1 << TDB_HASH_GROUP_BITS) + 1;
-		     j < (16 << TDB_HASH_GROUP_BITS);
-		     j++) {
-			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
-			ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
-			ok1(d.dsize == sizeof(j));
-			ok1(d.dptr != NULL);
-			ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
-			free(d.dptr);
-		}
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-		/* Traverse through them. */
-		ok1(tdb_traverse(tdb, trav, NULL) == j);
-
-		/* Empty the first chain-worth. */
-		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++)
-			ok1(tdb_delete(tdb, key) == 0);
-
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-		for (j = (1 << TDB_HASH_GROUP_BITS);
-		     j < (16 << TDB_HASH_GROUP_BITS);
-		     j++) {
-			ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
-			ok1(d.dsize == sizeof(j));
-			ok1(d.dptr != NULL);
-			ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
-			free(d.dptr);
-		}
-
-		/* Traverse through them. */
-		ok1(tdb_traverse(tdb, trav, NULL)
-		    == (15 << TDB_HASH_GROUP_BITS));
-
-		/* Re-add */
-		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) {
-			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
-		}
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-		/* Now try deleting as we go. */
-		ok1(tdb_traverse(tdb, trav, trav)
-		    == (16 << TDB_HASH_GROUP_BITS));
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-		ok1(tdb_traverse(tdb, trav, NULL) == 0);
-		tdb_close(tdb);
-	}
-
-	ok1(tap_log_messages == 0);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/run-30-exhaust-before-expand.c b/lib/tdb2/test/run-30-exhaust-before-expand.c
deleted file mode 100644
index 13bb9461d4..0000000000
--- a/lib/tdb2/test/run-30-exhaust-before-expand.c
+++ /dev/null
@@ -1,71 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-
-static bool empty_freetable(struct tdb_context *tdb)
-{
-	struct tdb_freetable ftab;
-	unsigned int i;
-
-	/* Now, free table should be completely exhausted in zone 0 */
-	if (tdb_read_convert(tdb, tdb->ftable_off, &ftab, sizeof(ftab)) != 0)
-		abort();
-
-	for (i = 0; i < sizeof(ftab.buckets)/sizeof(ftab.buckets[0]); i++) {
-		if (ftab.buckets[i])
-			return false;
-	}
-	return true;
-}
-
-
-int main(int argc, char *argv[])
-{
-	unsigned int i, j;
-	struct tdb_context *tdb;
-	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
-			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
-			TDB_NOMMAP|TDB_CONVERT };
-
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 9 + 1);
-
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		TDB_DATA k;
-		uint64_t size;
-		bool was_empty = false;
-
-		k.dptr = (void *)&j;
-		k.dsize = sizeof(j);
-
-		tdb = tdb_open("run-30-exhaust-before-expand.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		ok1(tdb);
-		if (!tdb)
-			continue;
-
-		ok1(empty_freetable(tdb));
-		/* Need some hash lock for expand. */
-		ok1(tdb_lock_hashes(tdb, 0, 1, F_WRLCK, TDB_LOCK_WAIT) == 0);
-		/* Create some free space. */
-		ok1(tdb_expand(tdb, 1) == 0);
-		ok1(tdb_unlock_hashes(tdb, 0, 1, F_WRLCK) == 0);
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-		ok1(!empty_freetable(tdb));
-
-		size = tdb->file->map_size;
-		/* Insert minimal-length records until we expand. */
-		for (j = 0; tdb->file->map_size == size; j++) {
-			was_empty = empty_freetable(tdb);
-			if (tdb_store(tdb, k, k, TDB_INSERT) != 0)
-				err(1, "Failed to store record %i", j);
-		}
-
-		/* Would have been empty before expansion, but no longer. */
-		ok1(was_empty);
-		ok1(!empty_freetable(tdb));
-		tdb_close(tdb);
-	}
-
-	ok1(tap_log_messages == 0);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/run-35-convert.c b/lib/tdb2/test/run-35-convert.c
deleted file mode 100644
index ac7939591b..0000000000
--- a/lib/tdb2/test/run-35-convert.c
+++ /dev/null
@@ -1,54 +0,0 @@
-#include "private.h"
-#include <ccan/failtest/failtest_override.h>
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include <ccan/failtest/failtest.h>
-#include "logging.h"
-#include "failtest_helper.h"
-
-int main(int argc, char *argv[])
-{
-	unsigned int i, messages = 0;
-	struct tdb_context *tdb;
-	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
-			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
-
-	failtest_init(argc, argv);
-	failtest_hook = block_repeat_failures;
-	failtest_exit_check = exit_check_log;
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 4);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		tdb = tdb_open("run-35-convert.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		if (!ok1(tdb))
-			failtest_exit(exit_status());
-
-		tdb_close(tdb);
-		/* If we say TDB_CONVERT, it must be converted */
-		tdb = tdb_open("run-35-convert.tdb",
-			       flags[i]|TDB_CONVERT,
-			       O_RDWR, 0600, &tap_log_attr);
-		if (flags[i] & TDB_CONVERT) {
-			if (!tdb)
-				failtest_exit(exit_status());
-			ok1(tdb_get_flags(tdb) & TDB_CONVERT);
-			tdb_close(tdb);
-		} else {
-			if (!ok1(!tdb && errno == EIO))
-				failtest_exit(exit_status());
-			ok1(tap_log_messages == ++messages);
-			if (!ok1(log_last && strstr(log_last, "TDB_CONVERT")))
-				failtest_exit(exit_status());
-		}
-
-		/* If don't say TDB_CONVERT, it *may* be converted */
-		tdb = tdb_open("run-35-convert.tdb",
-			       flags[i] & ~TDB_CONVERT,
-			       O_RDWR, 0600, &tap_log_attr);
-		if (!tdb)
-			failtest_exit(exit_status());
-		ok1(tdb_get_flags(tdb) == flags[i]);
-		tdb_close(tdb);
-	}
-	failtest_exit(exit_status());
-}
diff --git a/lib/tdb2/test/run-50-multiple-freelists.c b/lib/tdb2/test/run-50-multiple-freelists.c
deleted file mode 100644
index b102876c8d..0000000000
--- a/lib/tdb2/test/run-50-multiple-freelists.c
+++ /dev/null
@@ -1,70 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-#include "layout.h"
-
-int main(int argc, char *argv[])
-{
-	tdb_off_t off;
-	struct tdb_context *tdb;
-	struct tdb_layout *layout;
-	TDB_DATA key, data;
-	union tdb_attribute seed;
-
-	/* This seed value previously tickled a layout.c bug. */
-	seed.base.attr = TDB_ATTRIBUTE_SEED;
-	seed.seed.seed = 0xb1142bc054d035b4ULL;
-	seed.base.next = &tap_log_attr;
-
-	plan_tests(11);
-	key = tdb_mkdata("Hello", 5);
-	data = tdb_mkdata("world", 5);
-
-	/* Create a TDB with three free tables. */
-	layout = new_tdb_layout();
-	tdb_layout_add_freetable(layout);
-	tdb_layout_add_freetable(layout);
-	tdb_layout_add_freetable(layout);
-	tdb_layout_add_free(layout, 80, 0);
-	/* Used record prevent coalescing. */
-	tdb_layout_add_used(layout, key, data, 6);
-	tdb_layout_add_free(layout, 160, 1);
-	key.dsize--;
-	tdb_layout_add_used(layout, key, data, 7);
-	tdb_layout_add_free(layout, 320, 2);
-	key.dsize--;
-	tdb_layout_add_used(layout, key, data, 8);
-	tdb_layout_add_free(layout, 40, 0);
-	tdb = tdb_layout_get(layout, free, &seed);
-	ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-	off = get_free(tdb, 0, 80 - sizeof(struct tdb_used_record), 0,
-		       TDB_USED_MAGIC, 0);
-	ok1(off == layout->elem[3].base.off);
-	ok1(tdb->ftable_off == layout->elem[0].base.off);
-
-	off = get_free(tdb, 0, 160 - sizeof(struct tdb_used_record), 0,
-		       TDB_USED_MAGIC, 0);
-	ok1(off == layout->elem[5].base.off);
-	ok1(tdb->ftable_off == layout->elem[1].base.off);
-
-	off = get_free(tdb, 0, 320 - sizeof(struct tdb_used_record), 0,
-		       TDB_USED_MAGIC, 0);
-	ok1(off == layout->elem[7].base.off);
-	ok1(tdb->ftable_off == layout->elem[2].base.off);
-
-	off = get_free(tdb, 0, 40 - sizeof(struct tdb_used_record), 0,
-		       TDB_USED_MAGIC, 0);
-	ok1(off == layout->elem[9].base.off);
-	ok1(tdb->ftable_off == layout->elem[0].base.off);
-
-	/* Now we fail. */
-	off = get_free(tdb, 0, 0, 1, TDB_USED_MAGIC, 0);
-	ok1(off == 0);
-
-	tdb_close(tdb);
-	tdb_layout_free(layout);
-
-	ok1(tap_log_messages == 0);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/run-56-open-during-transaction.c b/lib/tdb2/test/run-56-open-during-transaction.c
deleted file mode 100644
index c514caa92b..0000000000
--- a/lib/tdb2/test/run-56-open-during-transaction.c
+++ /dev/null
@@ -1,165 +0,0 @@
-#include "private.h"
-#include <unistd.h>
-#include "lock-tracking.h"
-
-static ssize_t pwrite_check(int fd, const void *buf, size_t count, off_t offset);
-static ssize_t write_check(int fd, const void *buf, size_t count);
-static int ftruncate_check(int fd, off_t length);
-
-#define pwrite pwrite_check
-#define write write_check
-#define fcntl fcntl_with_lockcheck
-#define ftruncate ftruncate_check
-
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include <stdlib.h>
-#include <stdbool.h>
-#include <stdarg.h>
-#include "external-agent.h"
-#include "logging.h"
-
-static struct agent *agent;
-static bool opened;
-static int errors = 0;
-#define TEST_DBNAME "run-56-open-during-transaction.tdb"
-
-#undef write
-#undef pwrite
-#undef fcntl
-#undef ftruncate
-
-static bool is_same(const char *snapshot, const char *latest, off_t len)
-{
-	unsigned i;
-
-	for (i = 0; i < len; i++) {
-		if (snapshot[i] != latest[i])
-			return false;
-	}
-	return true;
-}
-
-static bool compare_file(int fd, const char *snapshot, off_t snapshot_len)
-{
-	char *contents;
-	bool ret;
-
-	/* over-length read serves as length check. */
-	contents = malloc(snapshot_len+1);
-	ret = pread(fd, contents, snapshot_len+1, 0) == snapshot_len
-		&& is_same(snapshot, contents, snapshot_len);
-	free(contents);
-	return ret;
-}
-
-static void check_file_intact(int fd)
-{
-	enum agent_return ret;
-	struct stat st;
-	char *contents;
-
-	fstat(fd, &st);
-	contents = malloc(st.st_size);
-	if (pread(fd, contents, st.st_size, 0) != st.st_size) {
-		diag("Read fail");
-		errors++;
-		return;
-	}
-
-	/* Ask agent to open file. */
-	ret = external_agent_operation(agent, OPEN, TEST_DBNAME);
-
-	/* It's OK to open it, but it must not have changed! */
-	if (!compare_file(fd, contents, st.st_size)) {
-		diag("Agent changed file after opening %s",
-		     agent_return_name(ret));
-		errors++;
-	}
-
-	if (ret == SUCCESS) {
-		ret = external_agent_operation(agent, CLOSE, NULL);
-		if (ret != SUCCESS) {
-			diag("Agent failed to close tdb: %s",
-			     agent_return_name(ret));
-			errors++;
-		}
-	} else if (ret != WOULD_HAVE_BLOCKED) {
-		diag("Agent opening file gave %s",
-		     agent_return_name(ret));
-		errors++;
-	}
-
-	free(contents);
-}
-
-static void after_unlock(int fd)
-{
-	if (opened)
-		check_file_intact(fd);
-}
-
-static ssize_t pwrite_check(int fd,
-			    const void *buf, size_t count, off_t offset)
-{
-	if (opened)
-		check_file_intact(fd);
-
-	return pwrite(fd, buf, count, offset);
-}
-
-static ssize_t write_check(int fd, const void *buf, size_t count)
-{
-	if (opened)
-		check_file_intact(fd);
-
-	return write(fd, buf, count);
-}
-
-static int ftruncate_check(int fd, off_t length)
-{
-	if (opened)
-		check_file_intact(fd);
-
-	return ftruncate(fd, length);
-
-}
-
-int main(int argc, char *argv[])
-{
-	const int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
-			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
-	int i;
-	struct tdb_context *tdb;
-	TDB_DATA key, data;
-
-	plan_tests(sizeof(flags)/sizeof(flags[0]) * 5);
-	agent = prepare_external_agent();
-	if (!agent)
-		err(1, "preparing agent");
-
-	unlock_callback = after_unlock;
-	for (i = 0; i < sizeof(flags)/sizeof(flags[0]); i++) {
-		diag("Test with %s and %s\n",
-		     (flags[i] & TDB_CONVERT) ? "CONVERT" : "DEFAULT",
-		     (flags[i] & TDB_NOMMAP) ? "no mmap" : "mmap");
-		unlink(TEST_DBNAME);
-		tdb = tdb_open(TEST_DBNAME, flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		ok1(tdb);
-
-		opened = true;
-		ok1(tdb_transaction_start(tdb) == 0);
-		key = tdb_mkdata("hi", strlen("hi"));
-		data = tdb_mkdata("world", strlen("world"));
-
-		ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
-		ok1(tdb_transaction_commit(tdb) == 0);
-		ok(!errors, "We had %u open errors", errors);
-
-		opened = false;
-		tdb_close(tdb);
-	}
-
-	return exit_status();
-}
diff --git a/lib/tdb2/test/run-57-die-during-transaction.c b/lib/tdb2/test/run-57-die-during-transaction.c
deleted file mode 100644
index ee33a896ff..0000000000
--- a/lib/tdb2/test/run-57-die-during-transaction.c
+++ /dev/null
@@ -1,293 +0,0 @@
-#include "private.h"
-#include <unistd.h>
-#include "lock-tracking.h"
-#include "tap-interface.h"
-#include <stdlib.h>
-#include <assert.h>
-static ssize_t pwrite_check(int fd, const void *buf, size_t count, off_t offset);
-static ssize_t write_check(int fd, const void *buf, size_t count);
-static int ftruncate_check(int fd, off_t length);
-
-#define pwrite pwrite_check
-#define write write_check
-#define fcntl fcntl_with_lockcheck
-#define ftruncate ftruncate_check
-
-/* There's a malloc inside transaction_setup_recovery, and valgrind complains
- * when we longjmp and leak it. */
-#define MAX_ALLOCATIONS 10
-static void *allocated[MAX_ALLOCATIONS];
-static unsigned max_alloc = 0;
-
-static void *malloc_noleak(size_t len)
-{
-	unsigned int i;
-
-	for (i = 0; i < MAX_ALLOCATIONS; i++)
-		if (!allocated[i]) {
-			allocated[i] = malloc(len);
-			if (i > max_alloc) {
-				max_alloc = i;
-				diag("max_alloc: %i", max_alloc);
-			}
-			return allocated[i];
-		}
-	diag("Too many allocations!");
-	abort();
-}
-
-static void *realloc_noleak(void *p, size_t size)
-{
-	unsigned int i;
-
-	for (i = 0; i < MAX_ALLOCATIONS; i++) {
-		if (allocated[i] == p) {
-			if (i > max_alloc) {
-				max_alloc = i;
-				diag("max_alloc: %i", max_alloc);
-			}
-			return allocated[i] = realloc(p, size);
-		}
-	}
-	diag("Untracked realloc!");
-	abort();
-}
-
-static void free_noleak(void *p)
-{
-	unsigned int i;
-
-	/* We don't catch asprintf, so don't complain if we miss one. */
-	for (i = 0; i < MAX_ALLOCATIONS; i++) {
-		if (allocated[i] == p) {
-			allocated[i] = NULL;
-			break;
-		}
-	}
-	free(p);
-}
-
-static void free_all(void)
-{
-	unsigned int i;
-
-	for (i = 0; i < MAX_ALLOCATIONS; i++) {
-		free(allocated[i]);
-		allocated[i] = NULL;
-	}
-}
-
-#define malloc malloc_noleak
-#define free free_noleak
-#define realloc realloc_noleak
-
-#include "tdb2-source.h"
-
-#undef malloc
-#undef free
-#undef realloc
-#undef write
-#undef pwrite
-#undef fcntl
-#undef ftruncate
-
-#include <stdbool.h>
-#include <stdarg.h>
-#include <setjmp.h>
-#include "external-agent.h"
-#include "logging.h"
-
-static bool in_transaction;
-static int target, current;
-static jmp_buf jmpbuf;
-#define TEST_DBNAME "run-57-die-during-transaction.tdb"
-#define KEY_STRING "helloworld"
-
-static void maybe_die(int fd)
-{
-	if (in_transaction && current++ == target) {
-		longjmp(jmpbuf, 1);
-	}
-}
-
-static ssize_t pwrite_check(int fd,
-			    const void *buf, size_t count, off_t offset)
-{
-	ssize_t ret;
-
-	maybe_die(fd);
-
-	ret = pwrite(fd, buf, count, offset);
-	if (ret != count)
-		return ret;
-
-	maybe_die(fd);
-	return ret;
-}
-
-static ssize_t write_check(int fd, const void *buf, size_t count)
-{
-	ssize_t ret;
-
-	maybe_die(fd);
-
-	ret = write(fd, buf, count);
-	if (ret != count)
-		return ret;
-
-	maybe_die(fd);
-	return ret;
-}
-
-static int ftruncate_check(int fd, off_t length)
-{
-	int ret;
-
-	maybe_die(fd);
-
-	ret = ftruncate(fd, length);
-
-	maybe_die(fd);
-	return ret;
-}
-
-static bool test_death(enum operation op, struct agent *agent)
-{
-	struct tdb_context *tdb = NULL;
-	TDB_DATA key;
-	enum agent_return ret;
-	int needed_recovery = 0;
-
-	current = target = 0;
-reset:
-	unlink(TEST_DBNAME);
-	tdb = tdb_open(TEST_DBNAME, TDB_NOMMAP,
-		       O_CREAT|O_TRUNC|O_RDWR, 0600, &tap_log_attr);
-	if (!tdb) {
-		diag("Failed opening TDB: %s", strerror(errno));
-		return false;
-	}
-
-	if (setjmp(jmpbuf) != 0) {
-		/* We're partway through.  Simulate our death. */
-		close(tdb->file->fd);
-		forget_locking();
-		in_transaction = false;
-
-		ret = external_agent_operation(agent, NEEDS_RECOVERY, "");
-		if (ret == SUCCESS)
-			needed_recovery++;
-		else if (ret != FAILED) {
-			diag("Step %u agent NEEDS_RECOVERY = %s", current,
-			     agent_return_name(ret));
-			return false;
-		}
-
-		ret = external_agent_operation(agent, op, KEY_STRING);
-		if (ret != SUCCESS) {
-			diag("Step %u op %s failed = %s", current,
-			     operation_name(op),
-			     agent_return_name(ret));
-			return false;
-		}
-
-		ret = external_agent_operation(agent, NEEDS_RECOVERY, "");
-		if (ret != FAILED) {
-			diag("Still needs recovery after step %u = %s",
-			     current, agent_return_name(ret));
-			return false;
-		}
-
-		ret = external_agent_operation(agent, CHECK, "");
-		if (ret != SUCCESS) {
-			diag("Step %u check failed = %s", current,
-			     agent_return_name(ret));
-			return false;
-		}
-
-		ret = external_agent_operation(agent, CLOSE, "");
-		if (ret != SUCCESS) {
-			diag("Step %u close failed = %s", current,
-			     agent_return_name(ret));
-			return false;
-		}
-
-		/* Suppress logging as this tries to use closed fd. */
-		suppress_logging = true;
-		suppress_lockcheck = true;
-		tdb_close(tdb);
-		suppress_logging = false;
-		suppress_lockcheck = false;
-		target++;
-		current = 0;
-		free_all();
-		goto reset;
-	}
-
-	/* Put key for agent to fetch. */
-	key = tdb_mkdata(KEY_STRING, strlen(KEY_STRING));
-	if (tdb_store(tdb, key, key, TDB_INSERT) != 0)
-		return false;
-
-	/* This is the key we insert in transaction. */
-	key.dsize--;
-
-	ret = external_agent_operation(agent, OPEN, TEST_DBNAME);
-	if (ret != SUCCESS)
-		errx(1, "Agent failed to open: %s", agent_return_name(ret));
-
-	ret = external_agent_operation(agent, FETCH, KEY_STRING);
-	if (ret != SUCCESS)
-		errx(1, "Agent failed find key: %s", agent_return_name(ret));
-
-	in_transaction = true;
-	if (tdb_transaction_start(tdb) != 0)
-		return false;
-
-	if (tdb_store(tdb, key, key, TDB_INSERT) != 0)
-		return false;
-
-	if (tdb_transaction_commit(tdb) != 0)
-		return false;
-
-	in_transaction = false;
-
-	/* We made it! */
-	diag("Completed %u runs", current);
-	tdb_close(tdb);
-	ret = external_agent_operation(agent, CLOSE, "");
-	if (ret != SUCCESS) {
-		diag("Step %u close failed = %s", current,
-		     agent_return_name(ret));
-		return false;
-	}
-
-	ok1(needed_recovery);
-	ok1(locking_errors == 0);
-	ok1(forget_locking() == 0);
-	locking_errors = 0;
-	return true;
-}
-
-int main(int argc, char *argv[])
-{
-	enum operation ops[] = { FETCH, STORE, TRANSACTION_START };
-	struct agent *agent;
-	int i;
-
-	plan_tests(12);
-	unlock_callback = maybe_die;
-
-	external_agent_free = free_noleak;
-	agent = prepare_external_agent();
-	if (!agent)
-		err(1, "preparing agent");
-
-	for (i = 0; i < sizeof(ops)/sizeof(ops[0]); i++) {
-		diag("Testing %s after death", operation_name(ops[i]));
-		ok1(test_death(ops[i], agent));
-	}
-
-	free_external_agent(agent);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/run-64-bit-tdb.c b/lib/tdb2/test/run-64-bit-tdb.c
deleted file mode 100644
index ef6e243a05..0000000000
--- a/lib/tdb2/test/run-64-bit-tdb.c
+++ /dev/null
@@ -1,72 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	struct tdb_context *tdb;
-	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
-			TDB_CONVERT,
-			TDB_NOMMAP|TDB_CONVERT };
-
-	if (sizeof(off_t) <= 4) {
-		plan_tests(1);
-		pass("No 64 bit off_t");
-		return exit_status();
-	}
-
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 14);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		off_t old_size;
-		TDB_DATA k, d;
-		struct hash_info h;
-		struct tdb_used_record rec;
-		tdb_off_t off;
-
-		tdb = tdb_open("run-64-bit-tdb.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		ok1(tdb);
-		if (!tdb)
-			continue;
-
-		old_size = tdb->file->map_size;
-
-		/* This makes a sparse file */
-		ok1(ftruncate(tdb->file->fd, 0xFFFFFFF0) == 0);
-		ok1(add_free_record(tdb, old_size, 0xFFFFFFF0 - old_size,
-				    TDB_LOCK_WAIT, false) == TDB_SUCCESS);
-
-		/* Now add a little record past the 4G barrier. */
-		ok1(tdb_expand_file(tdb, 100) == TDB_SUCCESS);
-		ok1(add_free_record(tdb, 0xFFFFFFF0, 100, TDB_LOCK_WAIT, false)
-		    == TDB_SUCCESS);
-
-		ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS);
-
-		/* Test allocation path. */
-		k = tdb_mkdata("key", 4);
-		d = tdb_mkdata("data", 5);
-		ok1(tdb_store(tdb, k, d, TDB_INSERT) == 0);
-		ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS);
-
-		/* Make sure it put it at end as we expected. */
-		off = find_and_lock(tdb, k, F_RDLCK, &h, &rec, NULL);
-		ok1(off >= 0xFFFFFFF0);
-		tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK);
-
-		ok1(tdb_fetch(tdb, k, &d) == 0);
-		ok1(d.dsize == 5);
-		ok1(strcmp((char *)d.dptr, "data") == 0);
-		free(d.dptr);
-
-		ok1(tdb_delete(tdb, k) == 0);
-		ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS);
-
-		tdb_close(tdb);
-	}
-
-	/* We might get messages about mmap failing, so don't test
-	 * tap_log_messages */
-	return exit_status();
-}
diff --git a/lib/tdb2/test/run-90-get-set-attributes.c b/lib/tdb2/test/run-90-get-set-attributes.c
deleted file mode 100644
index edf0735013..0000000000
--- a/lib/tdb2/test/run-90-get-set-attributes.c
+++ /dev/null
@@ -1,159 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-
-static int mylock(int fd, int rw, off_t off, off_t len, bool waitflag,
-		  void *unused)
-{
-	return 0;
-}
-
-static int myunlock(int fd, int rw, off_t off, off_t len, void *unused)
-{
-	return 0;
-}
-
-static uint64_t hash_fn(const void *key, size_t len, uint64_t seed,
-			void *priv)
-{
-	return 0;
-}
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	struct tdb_context *tdb;
-	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
-			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
-	union tdb_attribute seed_attr;
-	union tdb_attribute hash_attr;
-	union tdb_attribute lock_attr;
-
-	seed_attr.base.attr = TDB_ATTRIBUTE_SEED;
-	seed_attr.base.next = &hash_attr;
-	seed_attr.seed.seed = 100;
-
-	hash_attr.base.attr = TDB_ATTRIBUTE_HASH;
-	hash_attr.base.next = &lock_attr;
-	hash_attr.hash.fn = hash_fn;
-	hash_attr.hash.data = &hash_attr;
-
-	lock_attr.base.attr = TDB_ATTRIBUTE_FLOCK;
-	lock_attr.base.next = &tap_log_attr;
-	lock_attr.flock.lock = mylock;
-	lock_attr.flock.unlock = myunlock;
-	lock_attr.flock.data = &lock_attr;
-
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 50);
-
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		union tdb_attribute attr;
-
-		/* First open with no attributes. */
-		tdb = tdb_open("run-90-get-set-attributes.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, NULL);
-		ok1(tdb);
-
-		/* Get log on no attributes will fail */
-		attr.base.attr = TDB_ATTRIBUTE_LOG;
-		ok1(tdb_get_attribute(tdb, &attr) == TDB_ERR_NOEXIST);
-		/* These always work. */
-		attr.base.attr = TDB_ATTRIBUTE_HASH;
-		ok1(tdb_get_attribute(tdb, &attr) == 0);
-		ok1(attr.base.attr == TDB_ATTRIBUTE_HASH);
-		ok1(attr.hash.fn == tdb_jenkins_hash);
-		attr.base.attr = TDB_ATTRIBUTE_FLOCK;
-		ok1(tdb_get_attribute(tdb, &attr) == 0);
-		ok1(attr.base.attr == TDB_ATTRIBUTE_FLOCK);
-		ok1(attr.flock.lock == tdb_fcntl_lock);
-		ok1(attr.flock.unlock == tdb_fcntl_unlock);
-		attr.base.attr = TDB_ATTRIBUTE_SEED;
-		ok1(tdb_get_attribute(tdb, &attr) == 0);
-		ok1(attr.base.attr == TDB_ATTRIBUTE_SEED);
-		/* This is possible, just astronomically unlikely. */
-		ok1(attr.seed.seed != 0);
-
-		/* Unset attributes. */
-		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_LOG);
-		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_FLOCK);
-
-		/* Set them. */
-		ok1(tdb_set_attribute(tdb, &tap_log_attr) == 0);
-		ok1(tdb_set_attribute(tdb, &lock_attr) == 0);
-		/* These should fail. */
-		ok1(tdb_set_attribute(tdb, &seed_attr) == TDB_ERR_EINVAL);
-		ok1(tap_log_messages == 1);
-		ok1(tdb_set_attribute(tdb, &hash_attr) == TDB_ERR_EINVAL);
-		ok1(tap_log_messages == 2);
-		tap_log_messages = 0;
-
-		/* Getting them should work as expected. */
-		attr.base.attr = TDB_ATTRIBUTE_LOG;
-		ok1(tdb_get_attribute(tdb, &attr) == 0);
-		ok1(attr.base.attr == TDB_ATTRIBUTE_LOG);
-		ok1(attr.log.fn == tap_log_attr.log.fn);
-		ok1(attr.log.data == tap_log_attr.log.data);
-
-		attr.base.attr = TDB_ATTRIBUTE_FLOCK;
-		ok1(tdb_get_attribute(tdb, &attr) == 0);
-		ok1(attr.base.attr == TDB_ATTRIBUTE_FLOCK);
-		ok1(attr.flock.lock == mylock);
-		ok1(attr.flock.unlock == myunlock);
-		ok1(attr.flock.data == &lock_attr);
-
-		/* Unset them again. */
-		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_FLOCK);
-		ok1(tap_log_messages == 0);
-		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_LOG);
-		ok1(tap_log_messages == 0);
-
-		tdb_close(tdb);
-		ok1(tap_log_messages == 0);
-
-		/* Now open with all attributes. */
-		tdb = tdb_open("run-90-get-set-attributes.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600,
-			       &seed_attr);
-
-		ok1(tdb);
-
-		/* Get will succeed */
-		attr.base.attr = TDB_ATTRIBUTE_LOG;
-		ok1(tdb_get_attribute(tdb, &attr) == 0);
-		ok1(attr.base.attr == TDB_ATTRIBUTE_LOG);
-		ok1(attr.log.fn == tap_log_attr.log.fn);
-		ok1(attr.log.data == tap_log_attr.log.data);
-
-		attr.base.attr = TDB_ATTRIBUTE_HASH;
-		ok1(tdb_get_attribute(tdb, &attr) == 0);
-		ok1(attr.base.attr == TDB_ATTRIBUTE_HASH);
-		ok1(attr.hash.fn == hash_fn);
-		ok1(attr.hash.data == &hash_attr);
-
-		attr.base.attr = TDB_ATTRIBUTE_FLOCK;
-		ok1(tdb_get_attribute(tdb, &attr) == 0);
-		ok1(attr.base.attr == TDB_ATTRIBUTE_FLOCK);
-		ok1(attr.flock.lock == mylock);
-		ok1(attr.flock.unlock == myunlock);
-		ok1(attr.flock.data == &lock_attr);
-
-		attr.base.attr = TDB_ATTRIBUTE_SEED;
-		ok1(tdb_get_attribute(tdb, &attr) == 0);
-		ok1(attr.base.attr == TDB_ATTRIBUTE_SEED);
-		ok1(attr.seed.seed == seed_attr.seed.seed);
-
-		/* Unset attributes. */
-		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_HASH);
-		ok1(tap_log_messages == 1);
-		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_SEED);
-		ok1(tap_log_messages == 2);
-		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_FLOCK);
-		tdb_unset_attribute(tdb, TDB_ATTRIBUTE_LOG);
-		ok1(tap_log_messages == 2);
-		tap_log_messages = 0;
-
-		tdb_close(tdb);
-
-	}
-	return exit_status();
-}
diff --git a/lib/tdb2/test/run-capabilities.c b/lib/tdb2/test/run-capabilities.c
deleted file mode 100644
index 1501abbe5c..0000000000
--- a/lib/tdb2/test/run-capabilities.c
+++ /dev/null
@@ -1,271 +0,0 @@
-#include <ccan/failtest/failtest_override.h>
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-#include "layout.h"
-#include "failtest_helper.h"
-#include <stdarg.h>
-
-static size_t len_of(bool breaks_check, bool breaks_write, bool breaks_open)
-{
-	size_t len = 0;
-	if (breaks_check)
-		len += 8;
-	if (breaks_write)
-		len += 16;
-	if (breaks_open)
-		len += 32;
-	return len;
-}
-
-/* Creates a TDB with various capabilities. */
-static void create_tdb(const char *name,
-		       unsigned int cap,
-		       bool breaks_check,
-		       bool breaks_write,
-		       bool breaks_open, ...)
-{
-	TDB_DATA key, data;
-	va_list ap;
-	struct tdb_layout *layout;
-	struct tdb_context *tdb;
-	int fd;
-
-	key = tdb_mkdata("Hello", 5);
-	data = tdb_mkdata("world", 5);
-
-	/* Create a TDB with some data, and some capabilities */
-	layout = new_tdb_layout();
-	tdb_layout_add_freetable(layout);
-	tdb_layout_add_used(layout, key, data, 6);
-	tdb_layout_add_free(layout, 80, 0);
-	tdb_layout_add_capability(layout, cap,
-				  breaks_write, breaks_check, breaks_open,
-				  len_of(breaks_check, breaks_write, breaks_open));
-
-	va_start(ap, breaks_open);
-	while ((cap = va_arg(ap, int)) != 0) {
-		breaks_check = va_arg(ap, int);
-		breaks_write = va_arg(ap, int);
-		breaks_open = va_arg(ap, int);
-
-		key.dsize--;
-		tdb_layout_add_used(layout, key, data, 11 - key.dsize);
-		tdb_layout_add_free(layout, 80, 0);
-		tdb_layout_add_capability(layout, cap,
-					  breaks_write, breaks_check,
-					  breaks_open,
-					  len_of(breaks_check, breaks_write,
-						 breaks_open));
-	}
-	va_end(ap);
-
-	/* We open-code this, because we need to use the failtest write. */
-	tdb = tdb_layout_get(layout, failtest_free, &tap_log_attr);
-
-	fd = open(name, O_RDWR|O_TRUNC|O_CREAT, 0600);
-	if (fd < 0)
-		err(1, "opening %s for writing", name);
-	if (write(fd, tdb->file->map_ptr, tdb->file->map_size)
-	    != tdb->file->map_size)
-		err(1, "writing %s", name);
-	close(fd);
-	tdb_close(tdb);
-	tdb_layout_free(layout);
-}
-
-/* Note all the "goto out" early exits: they're to shorten failtest time. */
-int main(int argc, char *argv[])
-{
-	struct tdb_context *tdb;
-	char *summary;
-
-	failtest_init(argc, argv);
-	failtest_hook = block_repeat_failures;
-	failtest_exit_check = exit_check_log;
-	plan_tests(60);
-
-	failtest_suppress = true;
-	/* Capability says you can ignore it? */
-	create_tdb("run-capabilities.tdb", 1, false, false, false, 0);
-
-	failtest_suppress = false;
-	tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDWR, 0,
-		       &tap_log_attr);
-	failtest_suppress = true;
-	if (!ok1(tdb))
-		goto out;
-	ok1(tap_log_messages == 0);
-	ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS);
-	ok1(tap_log_messages == 0);
-	tdb_close(tdb);
-
-	/* Two capabilitues say you can ignore them? */
-	create_tdb("run-capabilities.tdb",
-		   1, false, false, false,
-		   2, false, false, false, 0);
-
-	failtest_suppress = false;
-	tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDWR, 0,
-		       &tap_log_attr);
-	failtest_suppress = true;
-	if (!ok1(tdb))
-		goto out;
-	ok1(tap_log_messages == 0);
-	ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS);
-	ok1(tap_log_messages == 0);
-	ok1(tdb_summary(tdb, 0, &summary) == TDB_SUCCESS);
-	ok1(strstr(summary, "Capability 1\n"));
-	free(summary);
-	tdb_close(tdb);
-
-	/* Capability says you can't check. */
-	create_tdb("run-capabilities.tdb",
-		   1, false, false, false,
-		   2, true, false, false, 0);
-
-	failtest_suppress = false;
-	tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDWR, 0,
-		       &tap_log_attr);
-	failtest_suppress = true;
-	if (!ok1(tdb))
-		goto out;
-	ok1(tap_log_messages == 0);
-	ok1(tdb_get_flags(tdb) & TDB_CANT_CHECK);
-	ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS);
-	/* We expect a warning! */
-	ok1(tap_log_messages == 1);
-	ok1(strstr(log_last, "capabilit"));
-	ok1(tdb_summary(tdb, 0, &summary) == TDB_SUCCESS);
-	ok1(strstr(summary, "Capability 1\n"));
-	ok1(strstr(summary, "Capability 2 (uncheckable)\n"));
-	free(summary);
-	tdb_close(tdb);
-
-	/* Capability says you can't write. */
-	create_tdb("run-capabilities.tdb",
-		   1, false, false, false,
-		   2, false, true, false, 0);
-
-	failtest_suppress = false;
-	tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDWR, 0,
-		       &tap_log_attr);
-	failtest_suppress = true;
-	/* We expect a message. */
-	ok1(!tdb);
-	if (!ok1(tap_log_messages == 2))
-		goto out;
-	if (!ok1(strstr(log_last, "unknown")))
-		goto out;
-	ok1(strstr(log_last, "write"));
-
-	/* We can open it read-only though! */
-	failtest_suppress = false;
-	tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDONLY, 0,
-		       &tap_log_attr);
-	failtest_suppress = true;
-	if (!ok1(tdb))
-		goto out;
-	ok1(tap_log_messages == 2);
-	ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS);
-	ok1(tap_log_messages == 2);
-	ok1(tdb_summary(tdb, 0, &summary) == TDB_SUCCESS);
-	ok1(strstr(summary, "Capability 1\n"));
-	ok1(strstr(summary, "Capability 2 (read-only)\n"));
-	free(summary);
-	tdb_close(tdb);
-
-	/* Capability says you can't open. */
-	create_tdb("run-capabilities.tdb",
-		   1, false, false, false,
-		   2, false, false, true, 0);
-
-	failtest_suppress = false;
-	tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDWR, 0,
-		       &tap_log_attr);
-	failtest_suppress = true;
-	/* We expect a message. */
-	ok1(!tdb);
-	if (!ok1(tap_log_messages == 3))
-		goto out;
-	if (!ok1(strstr(log_last, "unknown")))
-		goto out;
-
-	/* Combine capabilities correctly. */
-	create_tdb("run-capabilities.tdb",
-		   1, false, false, false,
-		   2, true, false, false,
-		   3, false, true, false, 0);
-
-	failtest_suppress = false;
-	tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDWR, 0,
-		       &tap_log_attr);
-	failtest_suppress = true;
-	/* We expect a message. */
-	ok1(!tdb);
-	if (!ok1(tap_log_messages == 4))
-		goto out;
-	if (!ok1(strstr(log_last, "unknown")))
-		goto out;
-	ok1(strstr(log_last, "write"));
-
-	/* We can open it read-only though! */
-	failtest_suppress = false;
-	tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDONLY, 0,
-		       &tap_log_attr);
-	failtest_suppress = true;
-	if (!ok1(tdb))
-		goto out;
-	ok1(tap_log_messages == 4);
-	ok1(tdb_get_flags(tdb) & TDB_CANT_CHECK);
-	ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS);
-	/* We expect a warning! */
-	ok1(tap_log_messages == 5);
-	ok1(strstr(log_last, "unknown"));
-	ok1(tdb_summary(tdb, 0, &summary) == TDB_SUCCESS);
-	ok1(strstr(summary, "Capability 1\n"));
-	ok1(strstr(summary, "Capability 2 (uncheckable)\n"));
-	ok1(strstr(summary, "Capability 3 (read-only)\n"));
-	free(summary);
-	tdb_close(tdb);
-
-	/* Two capability flags in one. */
-	create_tdb("run-capabilities.tdb",
-		   1, false, false, false,
-		   2, true, true, false,
-		   0);
-
-	failtest_suppress = false;
-	tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDWR, 0,
-		       &tap_log_attr);
-	failtest_suppress = true;
-	/* We expect a message. */
-	ok1(!tdb);
-	if (!ok1(tap_log_messages == 6))
-		goto out;
-	if (!ok1(strstr(log_last, "unknown")))
-		goto out;
-	ok1(strstr(log_last, "write"));
-
-	/* We can open it read-only though! */
-	failtest_suppress = false;
-	tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDONLY, 0,
-		       &tap_log_attr);
-	failtest_suppress = true;
-	if (!ok1(tdb))
-		goto out;
-	ok1(tap_log_messages == 6);
-	ok1(tdb_get_flags(tdb) & TDB_CANT_CHECK);
-	ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS);
-	/* We expect a warning! */
-	ok1(tap_log_messages == 7);
-	ok1(strstr(log_last, "unknown"));
-	ok1(tdb_summary(tdb, 0, &summary) == TDB_SUCCESS);
-	ok1(strstr(summary, "Capability 1\n"));
-	ok1(strstr(summary, "Capability 2 (uncheckable,read-only)\n"));
-	free(summary);
-	tdb_close(tdb);
-
-out:
-	failtest_exit(exit_status());
-}
diff --git a/lib/tdb2/test/run-expand-in-transaction.c b/lib/tdb2/test/run-expand-in-transaction.c
deleted file mode 100644
index 6b22d2ef46..0000000000
--- a/lib/tdb2/test/run-expand-in-transaction.c
+++ /dev/null
@@ -1,36 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	struct tdb_context *tdb;
-	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
-			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
-	struct tdb_data key = tdb_mkdata("key", 3);
-	struct tdb_data data = tdb_mkdata("data", 4);
-
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 7 + 1);
-
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		size_t size;
-		tdb = tdb_open("run-expand-in-transaction.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		ok1(tdb);
-		if (!tdb)
-			continue;
-
-		size = tdb->file->map_size;
-		ok1(tdb_transaction_start(tdb) == 0);
-		ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
-		ok1(tdb->file->map_size > size);
-		ok1(tdb_transaction_commit(tdb) == 0);
-		ok1(tdb->file->map_size > size);
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-		tdb_close(tdb);
-	}
-
-	ok1(tap_log_messages == 0);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/run-features.c b/lib/tdb2/test/run-features.c
deleted file mode 100644
index f552fcfb58..0000000000
--- a/lib/tdb2/test/run-features.c
+++ /dev/null
@@ -1,62 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-
-int main(int argc, char *argv[])
-{
-	unsigned int i, j;
-	struct tdb_context *tdb;
-	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
-			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
-	struct tdb_data key = { (unsigned char *)&j, sizeof(j) };
-	struct tdb_data data = { (unsigned char *)&j, sizeof(j) };
-
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 8 + 1);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		uint64_t features;
-		tdb = tdb_open("run-features.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		ok1(tdb);
-		if (!tdb)
-			continue;
-
-		/* Put some stuff in there. */
-		for (j = 0; j < 100; j++) {
-			if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
-				fail("Storing in tdb");
-		}
-
-		/* Mess with features fields in hdr. */
-		features = (~TDB_FEATURE_MASK ^ 1);
-		ok1(tdb_write_convert(tdb, offsetof(struct tdb_header,
-						    features_used),
-				      &features, sizeof(features)) == 0);
-		ok1(tdb_write_convert(tdb, offsetof(struct tdb_header,
-						    features_offered),
-				      &features, sizeof(features)) == 0);
-		tdb_close(tdb);
-
-		tdb = tdb_open("run-features.tdb", flags[i], O_RDWR, 0,
-			       &tap_log_attr);
-		ok1(tdb);
-		if (!tdb)
-			continue;
-
-		/* Should not have changed features offered. */
-		ok1(tdb_read_convert(tdb, offsetof(struct tdb_header,
-						   features_offered),
-				     &features, sizeof(features)) == 0);
-		ok1(features == (~TDB_FEATURE_MASK ^ 1));
-
-		/* Should have cleared unknown bits in features_used. */
-		ok1(tdb_read_convert(tdb, offsetof(struct tdb_header,
-						   features_used),
-				     &features, sizeof(features)) == 0);
-		ok1(features == (1 & TDB_FEATURE_MASK));
-
-		tdb_close(tdb);
-	}
-
-	ok1(tap_log_messages == 0);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/run-lockall.c b/lib/tdb2/test/run-lockall.c
deleted file mode 100644
index 3ae0d14f65..0000000000
--- a/lib/tdb2/test/run-lockall.c
+++ /dev/null
@@ -1,71 +0,0 @@
-#include "private.h"
-#include <unistd.h>
-#include "lock-tracking.h"
-
-#define fcntl fcntl_with_lockcheck
-#include "tdb2-source.h"
-
-#include "tap-interface.h"
-#include <stdlib.h>
-#include <stdbool.h>
-#include <stdarg.h>
-#include <ccan/err/err.h>
-#include "external-agent.h"
-#include "logging.h"
-
-#define TEST_DBNAME "run-lockall.tdb"
-
-#undef fcntl
-
-int main(int argc, char *argv[])
-{
-	struct agent *agent;
-	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
-			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
-	int i;
-
-	plan_tests(13 * sizeof(flags)/sizeof(flags[0]) + 1);
-	agent = prepare_external_agent();
-	if (!agent)
-		err(1, "preparing agent");
-
-	for (i = 0; i < sizeof(flags)/sizeof(flags[0]); i++) {
-		enum agent_return ret;
-		struct tdb_context *tdb;
-
-		tdb = tdb_open(TEST_DBNAME, flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		ok1(tdb);
-
-		ret = external_agent_operation(agent, OPEN, TEST_DBNAME);
-		ok1(ret == SUCCESS);
-
-		ok1(tdb_lockall(tdb) == TDB_SUCCESS);
-		ok1(external_agent_operation(agent, STORE, "key")
-		    == WOULD_HAVE_BLOCKED);
-		ok1(external_agent_operation(agent, FETCH, "key")
-		    == WOULD_HAVE_BLOCKED);
-		/* Test nesting. */
-		ok1(tdb_lockall(tdb) == TDB_SUCCESS);
-		tdb_unlockall(tdb);
-		tdb_unlockall(tdb);
-
-		ok1(external_agent_operation(agent, STORE, "key") == SUCCESS);
-
-		ok1(tdb_lockall_read(tdb) == TDB_SUCCESS);
-		ok1(external_agent_operation(agent, STORE, "key")
-		    == WOULD_HAVE_BLOCKED);
-		ok1(external_agent_operation(agent, FETCH, "key") == SUCCESS);
-		ok1(tdb_lockall_read(tdb) == TDB_SUCCESS);
-		tdb_unlockall_read(tdb);
-		tdb_unlockall_read(tdb);
-
-		ok1(external_agent_operation(agent, STORE, "key") == SUCCESS);
-		ok1(external_agent_operation(agent, CLOSE, NULL) == SUCCESS);
-		tdb_close(tdb);
-	}
-
-	free_external_agent(agent);
-	ok1(tap_log_messages == 0);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/run-remap-in-read_traverse.c b/lib/tdb2/test/run-remap-in-read_traverse.c
deleted file mode 100644
index 16a1baab46..0000000000
--- a/lib/tdb2/test/run-remap-in-read_traverse.c
+++ /dev/null
@@ -1,57 +0,0 @@
-#include "tdb2-source.h"
-/* We had a bug where we marked the tdb read-only for a tdb_traverse_read.
- * If we then expanded the tdb, we would remap read-only, and later SEGV. */
-#include "tap-interface.h"
-#include "external-agent.h"
-#include "logging.h"
-
-static bool file_larger(int fd, tdb_len_t size)
-{
-	struct stat st;
-
-	fstat(fd, &st);
-	return st.st_size != size;
-}
-
-static unsigned add_records_to_grow(struct agent *agent, int fd, tdb_len_t size)
-{
-	unsigned int i;
-
-	for (i = 0; !file_larger(fd, size); i++) {
-		char data[20];
-		sprintf(data, "%i", i);
-		if (external_agent_operation(agent, STORE, data) != SUCCESS)
-			return 0;
-	}
-	diag("Added %u records to grow file", i);
-	return i;
-}
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	struct agent *agent;
-	struct tdb_context *tdb;
-	struct tdb_data d = tdb_mkdata("hello", 5);
-	const char filename[] = "run-remap-in-read_traverse.tdb";
-
-	plan_tests(4);
-
-	agent = prepare_external_agent();
-
-	tdb = tdb_open(filename, TDB_DEFAULT,
-		       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-
-	ok1(external_agent_operation(agent, OPEN, filename) == SUCCESS);
-	i = add_records_to_grow(agent, tdb->file->fd, tdb->file->map_size);
-
-	/* Do a traverse. */
-	ok1(tdb_traverse(tdb, NULL, NULL) == i);
-
-	/* Now store something! */
-	ok1(tdb_store(tdb, d, d, TDB_INSERT) == 0);
-	ok1(tap_log_messages == 0);
-	tdb_close(tdb);
-	free_external_agent(agent);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/run-seed.c b/lib/tdb2/test/run-seed.c
deleted file mode 100644
index 9c90833001..0000000000
--- a/lib/tdb2/test/run-seed.c
+++ /dev/null
@@ -1,61 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-
-static int log_count = 0;
-
-/* Normally we get a log when setting random seed. */
-static void my_log_fn(struct tdb_context *tdb,
-		      enum tdb_log_level level,
-		      enum TDB_ERROR ecode,
-		      const char *message, void *priv)
-{
-	log_count++;
-}
-
-static union tdb_attribute log_attr = {
-	.log = { .base = { .attr = TDB_ATTRIBUTE_LOG },
-		 .fn = my_log_fn }
-};
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	struct tdb_context *tdb;
-	union tdb_attribute attr;
-	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
-			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
-			TDB_NOMMAP|TDB_CONVERT };
-
-	attr.seed.base.attr = TDB_ATTRIBUTE_SEED;
-	attr.seed.base.next = &log_attr;
-	attr.seed.seed = 42;
-
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 4 + 4 * 3);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		struct tdb_header hdr;
-		int fd;
-		tdb = tdb_open("run-seed.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &attr);
-		ok1(tdb);
-		if (!tdb)
-			continue;
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-		ok1(tdb->hash_seed == 42);
-		ok1(log_count == 0);
-		tdb_close(tdb);
-
-		if (flags[i] & TDB_INTERNAL)
-			continue;
-
-		fd = open("run-seed.tdb", O_RDONLY);
-		ok1(fd >= 0);
-		ok1(read(fd, &hdr, sizeof(hdr)) == sizeof(hdr));
-		if (flags[i] & TDB_CONVERT)
-			ok1(bswap_64(hdr.hash_seed) == 42);
-		else
-			ok1(hdr.hash_seed == 42);
-		close(fd);
-	}
-	return exit_status();
-}
diff --git a/lib/tdb2/test/run-tdb_errorstr.c b/lib/tdb2/test/run-tdb_errorstr.c
deleted file mode 100644
index 7a2da251aa..0000000000
--- a/lib/tdb2/test/run-tdb_errorstr.c
+++ /dev/null
@@ -1,52 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-
-int main(int argc, char *argv[])
-{
-	enum TDB_ERROR e;
-	plan_tests(TDB_ERR_RDONLY*-1 + 2);
-
-	for (e = TDB_SUCCESS; e >= TDB_ERR_RDONLY; e--) {
-		switch (e) {
-		case TDB_SUCCESS:
-			ok1(!strcmp(tdb_errorstr(e),
-				    "Success"));
-			break;
-		case TDB_ERR_IO:
-			ok1(!strcmp(tdb_errorstr(e),
-				    "IO Error"));
-			break;
-		case TDB_ERR_LOCK:
-			ok1(!strcmp(tdb_errorstr(e),
-				    "Locking error"));
-			break;
-		case TDB_ERR_OOM:
-			ok1(!strcmp(tdb_errorstr(e),
-				    "Out of memory"));
-			break;
-		case TDB_ERR_EXISTS:
-			ok1(!strcmp(tdb_errorstr(e),
-				    "Record exists"));
-			break;
-		case TDB_ERR_EINVAL:
-			ok1(!strcmp(tdb_errorstr(e),
-				    "Invalid parameter"));
-			break;
-		case TDB_ERR_NOEXIST:
-			ok1(!strcmp(tdb_errorstr(e),
-				    "Record does not exist"));
-			break;
-		case TDB_ERR_RDONLY:
-			ok1(!strcmp(tdb_errorstr(e),
-				    "write not permitted"));
-			break;
-		case TDB_ERR_CORRUPT:
-			ok1(!strcmp(tdb_errorstr(e),
-				    "Corrupt database"));
-			break;
-		}
-	}
-	ok1(!strcmp(tdb_errorstr(e), "Invalid error code"));
-
-	return exit_status();
-}
diff --git a/lib/tdb2/test/run-tdb_foreach.c b/lib/tdb2/test/run-tdb_foreach.c
deleted file mode 100644
index b1eb2de217..0000000000
--- a/lib/tdb2/test/run-tdb_foreach.c
+++ /dev/null
@@ -1,86 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-
-static int drop_count(struct tdb_context *tdb, unsigned int *count)
-{
-	if (--(*count) == 0)
-		return 1;
-	return 0;
-}
-
-static int set_found(struct tdb_context *tdb, bool found[3])
-{
-	unsigned int idx;
-
-	if (strcmp(tdb_name(tdb), "run-tdb_foreach0.tdb") == 0)
-		idx = 0;
-	else if (strcmp(tdb_name(tdb), "run-tdb_foreach1.tdb") == 0)
-		idx = 1;
-	else if (strcmp(tdb_name(tdb), "run-tdb_foreach2.tdb") == 0)
-		idx = 2;
-	else
-		abort();
-
-	if (found[idx])
-		abort();
-	found[idx] = true;
-	return 0;
-}
-
-int main(int argc, char *argv[])
-{
-	unsigned int i, count;
-	bool found[3];
-	struct tdb_context *tdb0, *tdb1, *tdb2;
-	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
-			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
-
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 8);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		tdb0 = tdb_open("run-tdb_foreach0.tdb", flags[i],
-				O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		tdb1 = tdb_open("run-tdb_foreach1.tdb", flags[i],
-				O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-		tdb2 = tdb_open("run-tdb_foreach2.tdb", flags[i],
-				O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
-
-		memset(found, 0, sizeof(found));
-		tdb_foreach(set_found, found);
-		ok1(found[0] && found[1] && found[2]);
-
-		/* Test premature iteration termination */
-		count = 1;
-		tdb_foreach(drop_count, &count);
-		ok1(count == 0);
-
-		tdb_close(tdb1);
-		memset(found, 0, sizeof(found));
-		tdb_foreach(set_found, found);
-		ok1(found[0] && !found[1] && found[2]);
-
-		tdb_close(tdb2);
-		memset(found, 0, sizeof(found));
-		tdb_foreach(set_found, found);
-		ok1(found[0] && !found[1] && !found[2]);
-
-		tdb1 = tdb_open("run-tdb_foreach1.tdb", flags[i],
-				O_RDWR, 0600, &tap_log_attr);
-		memset(found, 0, sizeof(found));
-		tdb_foreach(set_found, found);
-		ok1(found[0] && found[1] && !found[2]);
-
-		tdb_close(tdb0);
-		memset(found, 0, sizeof(found));
-		tdb_foreach(set_found, found);
-		ok1(!found[0] && found[1] && !found[2]);
-
-		tdb_close(tdb1);
-		memset(found, 0, sizeof(found));
-		tdb_foreach(set_found, found);
-		ok1(!found[0] && !found[1] && !found[2]);
-		ok1(tap_log_messages == 0);
-	}
-
-	return exit_status();
-}
diff --git a/lib/tdb2/test/run-traverse.c b/lib/tdb2/test/run-traverse.c
deleted file mode 100644
index 20d610fe66..0000000000
--- a/lib/tdb2/test/run-traverse.c
+++ /dev/null
@@ -1,203 +0,0 @@
-#include "tdb2-source.h"
-#include "tap-interface.h"
-#include "logging.h"
-
-#define NUM_RECORDS 1000
-
-/* We use the same seed which we saw a failure on. */
-static uint64_t fixedhash(const void *key, size_t len, uint64_t seed, void *p)
-{
-	return hash64_stable((const unsigned char *)key, len,
-			     *(uint64_t *)p);
-}
-
-static bool store_records(struct tdb_context *tdb)
-{
-	int i;
-	struct tdb_data key = { (unsigned char *)&i, sizeof(i) };
-	struct tdb_data data = { (unsigned char *)&i, sizeof(i) };
-
-	for (i = 0; i < NUM_RECORDS; i++)
-		if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
-			return false;
-	return true;
-}
-
-struct trav_data {
-	unsigned int calls, call_limit;
-	int low, high;
-	bool mismatch;
-	bool delete;
-	enum TDB_ERROR delete_error;
-};
-
-static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf,
-		struct trav_data *td)
-{
-	int val;
-
-	td->calls++;
-	if (key.dsize != sizeof(val) || dbuf.dsize != sizeof(val)
-	    || memcmp(key.dptr, dbuf.dptr, key.dsize) != 0) {
-		td->mismatch = true;
-		return -1;
-	}
-	memcpy(&val, dbuf.dptr, dbuf.dsize);
-	if (val < td->low)
-		td->low = val;
-	if (val > td->high)
-		td->high = val;
-
-	if (td->delete) {
-		td->delete_error = tdb_delete(tdb, key);
-		if (td->delete_error != TDB_SUCCESS) {
-			return -1;
-		}
-	}
-
-	if (td->calls == td->call_limit)
-		return 1;
-	return 0;
-}
-
-struct trav_grow_data {
-	unsigned int calls;
-	unsigned int num_large;
-	bool mismatch;
-	enum TDB_ERROR error;
-};
-
-static int trav_grow(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf,
-		     struct trav_grow_data *tgd)
-{
-	int val;
-	unsigned char buffer[128] = { 0 };
-
-	tgd->calls++;
-	if (key.dsize != sizeof(val) || dbuf.dsize < sizeof(val)
-	    || memcmp(key.dptr, dbuf.dptr, key.dsize) != 0) {
-		tgd->mismatch = true;
-		return -1;
-	}
-
-	if (dbuf.dsize > sizeof(val))
-		/* We must have seen this before! */
-		tgd->num_large++;
-
-	/* Make a big difference to the database. */
-	dbuf.dptr = buffer;
-	dbuf.dsize = sizeof(buffer);
-	tgd->error = tdb_append(tdb, key, dbuf);
-	if (tgd->error != TDB_SUCCESS) {
-		return -1;
-	}
-	return 0;
-}
-
-int main(int argc, char *argv[])
-{
-	unsigned int i;
-	int num;
-	struct trav_data td;
-	struct trav_grow_data tgd;
-	struct tdb_context *tdb;
-	uint64_t seed = 16014841315512641303ULL;
-	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
-			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
-			TDB_NOMMAP|TDB_CONVERT };
-	union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
-						.fn = fixedhash,
-						.data = &seed } };
-
-	hattr.base.next = &tap_log_attr;
-
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 32 + 1);
-	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
-		tdb = tdb_open("run-traverse.tdb", flags[i],
-			       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
-		ok1(tdb);
-		if (!tdb)
-			continue;
-
-		ok1(tdb_traverse(tdb, NULL, NULL) == 0);
-
-		ok1(store_records(tdb));
-		num = tdb_traverse(tdb, NULL, NULL);
-		ok1(num == NUM_RECORDS);
-
-		/* Full traverse. */
-		td.calls = 0;
-		td.call_limit = UINT_MAX;
-		td.low = INT_MAX;
-		td.high = INT_MIN;
-		td.mismatch = false;
-		td.delete = false;
-
-		num = tdb_traverse(tdb, trav, &td);
-		ok1(num == NUM_RECORDS);
-		ok1(!td.mismatch);
-		ok1(td.calls == NUM_RECORDS);
-		ok1(td.low == 0);
-		ok1(td.high == NUM_RECORDS-1);
-
-		/* Short traverse. */
-		td.calls = 0;
-		td.call_limit = NUM_RECORDS / 2;
-		td.low = INT_MAX;
-		td.high = INT_MIN;
-		td.mismatch = false;
-		td.delete = false;
-
-		num = tdb_traverse(tdb, trav, &td);
-		ok1(num == NUM_RECORDS / 2);
-		ok1(!td.mismatch);
-		ok1(td.calls == NUM_RECORDS / 2);
-		ok1(td.low <= NUM_RECORDS / 2);
-		ok1(td.high > NUM_RECORDS / 2);
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-		ok1(tap_log_messages == 0);
-
-		/* Deleting traverse (delete everything). */
-		td.calls = 0;
-		td.call_limit = UINT_MAX;
-		td.low = INT_MAX;
-		td.high = INT_MIN;
-		td.mismatch = false;
-		td.delete = true;
-		td.delete_error = TDB_SUCCESS;
-		num = tdb_traverse(tdb, trav, &td);
-		ok1(num == NUM_RECORDS);
-		ok1(td.delete_error == TDB_SUCCESS);
-		ok1(!td.mismatch);
-		ok1(td.calls == NUM_RECORDS);
-		ok1(td.low == 0);
-		ok1(td.high == NUM_RECORDS - 1);
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-		/* Now it's empty! */
-		ok1(tdb_traverse(tdb, NULL, NULL) == 0);
-
-		/* Re-add. */
-		ok1(store_records(tdb));
-		ok1(tdb_traverse(tdb, NULL, NULL) == NUM_RECORDS);
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-
-		/* Grow.  This will cause us to be reshuffled. */
-		tgd.calls = 0;
-		tgd.num_large = 0;
-		tgd.mismatch = false;
-		tgd.error = TDB_SUCCESS;
-		ok1(tdb_traverse(tdb, trav_grow, &tgd) > 1);
-		ok1(tgd.error == 0);
-		ok1(!tgd.mismatch);
-		ok1(tdb_check(tdb, NULL, NULL) == 0);
-		ok1(tgd.num_large < tgd.calls);
-		diag("growing db: %u calls, %u repeats",
-		     tgd.calls, tgd.num_large);
-
-		tdb_close(tdb);
-	}
-
-	ok1(tap_log_messages == 0);
-	return exit_status();
-}
diff --git a/lib/tdb2/test/tap-interface.c b/lib/tdb2/test/tap-interface.c
deleted file mode 100644
index 077ec2cd9a..0000000000
--- a/lib/tdb2/test/tap-interface.c
+++ /dev/null
@@ -1,3 +0,0 @@
-#include "tap-interface.h"
-
-unsigned tap_ok_count, tap_ok_target = -1U;
diff --git a/lib/tdb2/test/tap-interface.h b/lib/tdb2/test/tap-interface.h
deleted file mode 100644
index f3d4ec2545..0000000000
--- a/lib/tdb2/test/tap-interface.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
-   Unix SMB/CIFS implementation.
-   Simplistic implementation of tap interface.
-
-   Copyright (C) Rusty Russell 2012
-
-     ** NOTE! The following LGPL license applies to the talloc
-     ** library. This does NOT imply that all of Samba is released
-     ** under the LGPL
-
-   This library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 3 of the License, or (at your option) any later version.
-
-   This library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
-#include <stdio.h>
-#include <ccan/err/err.h>
-
-#ifndef __location__
-#define __TAP_STRING_LINE1__(s)    #s
-#define __TAP_STRING_LINE2__(s)   __TAP_STRING_LINE1__(s)
-#define __TAP_STRING_LINE3__  __TAP_STRING_LINE2__(__LINE__)
-#define __location__ __FILE__ ":" __TAP_STRING_LINE3__
-#endif
-
-extern unsigned tap_ok_count, tap_ok_target;
-#define plan_tests(num) do { tap_ok_target = (num); } while(0)
-#define ok(e, ...) ((e) ? (printf("."), tap_ok_count++, true) : (warnx(__VA_ARGS__), false))
-#define ok1(e) ok((e), "%s:%s", __location__, #e)
-#define pass(...) (printf("."), tap_ok_count++)
-#define fail(...) warnx(__VA_ARGS__)
-#define diag printf
-#define exit_status() (tap_ok_count == tap_ok_target ? 0 : 1)
diff --git a/lib/tdb2/test/tdb2-source.h b/lib/tdb2/test/tdb2-source.h
deleted file mode 100644
index d13d8b868c..0000000000
--- a/lib/tdb2/test/tdb2-source.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#include "config.h"
-#include "check.c"
-#include "free.c"
-#include "hash.c"
-#include "io.c"
-#include "lock.c"
-#include "open.c"
-#include "summary.c"
-#include "tdb.c"
-#include "transaction.c"
-#include "traverse.c"
diff --git a/lib/tdb2/tools/Makefile b/lib/tdb2/tools/Makefile
deleted file mode 100644
index 11188c3baf..0000000000
--- a/lib/tdb2/tools/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-OBJS:=../../tdb2.o ../../hash.o ../../tally.o
-CFLAGS:=-I../../.. -I.. -Wall -g -O3 #-g -pg
-LDFLAGS:=-L../../..
-
-default: tdb2torture tdb2tool tdb2dump tdb2restore mktdb2 speed growtdb-bench
-
-tdb2dump: tdb2dump.c $(OBJS)
-tdb2restore: tdb2restore.c $(OBJS)
-tdb2torture: tdb2torture.c $(OBJS)
-tdb2tool: tdb2tool.c $(OBJS)
-mktdb2: mktdb2.c $(OBJS)
-speed: speed.c $(OBJS)
-growtdb-bench: growtdb-bench.c $(OBJS)
-
-clean:
-	rm -f tdb2torture tdb2dump tdb2restore tdb2tool mktdb2 speed growtdb-bench
diff --git a/lib/tdb2/tools/growtdb-bench.c b/lib/tdb2/tools/growtdb-bench.c
deleted file mode 100644
index 476e8be5da..0000000000
--- a/lib/tdb2/tools/growtdb-bench.c
+++ /dev/null
@@ -1,114 +0,0 @@
-#include "tdb2.h"
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <ccan/err/err.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-
-static void logfn(struct tdb_context *tdb,
-		  enum tdb_log_level level,
-		  enum TDB_ERROR ecode,
-		  const char *message,
-		  void *data)
-{
-	fprintf(stderr, "tdb:%s:%s:%s\n",
-		tdb_name(tdb), tdb_errorstr(ecode), message);
-}
-
-int main(int argc, char *argv[])
-{
-	unsigned int i, j, users, groups;
-	TDB_DATA idxkey, idxdata;
-	TDB_DATA k, d, gk;
-	char cmd[100];
-	struct tdb_context *tdb;
-	enum TDB_ERROR ecode;
-	union tdb_attribute log;
-
-	if (argc != 3) {
-		printf("Usage: growtdb-bench <users> <groups>\n");
-		exit(1);
-	}
-	users = atoi(argv[1]);
-	groups = atoi(argv[2]);
-
-	sprintf(cmd, "cat /proc/%i/statm", getpid());
-
-	log.base.attr = TDB_ATTRIBUTE_LOG;
-	log.base.next = NULL;
-	log.log.fn = logfn;
-
-	tdb = tdb_open("/tmp/growtdb.tdb", TDB_DEFAULT,
-		       O_RDWR|O_CREAT|O_TRUNC, 0600, &log);
-
-	idxkey.dptr = (unsigned char *)"User index";
-	idxkey.dsize = strlen("User index");
-	idxdata.dsize = 51;
-	idxdata.dptr = calloc(idxdata.dsize, 1);
-
-	/* Create users. */
-	k.dsize = 48;
-	k.dptr = calloc(k.dsize, 1);
-	d.dsize = 64;
-	d.dptr = calloc(d.dsize, 1);
-
-	tdb_transaction_start(tdb);
-	for (i = 0; i < users; i++) {
-		memcpy(k.dptr, &i, sizeof(i));
-		ecode = tdb_store(tdb, k, d, TDB_INSERT);
-		if (ecode != TDB_SUCCESS)
-			errx(1, "tdb insert failed: %s", tdb_errorstr(ecode));
-
-		/* This simulates a growing index record. */
-		ecode = tdb_append(tdb, idxkey, idxdata);
-		if (ecode != TDB_SUCCESS)
-			errx(1, "tdb append failed: %s", tdb_errorstr(ecode));
-	}
-	if ((ecode = tdb_transaction_commit(tdb)) != 0)
-		errx(1, "tdb commit1 failed: %s", tdb_errorstr(ecode));
-
-	if ((ecode = tdb_check(tdb, NULL, NULL)) != 0)
-		errx(1, "tdb_check failed after initial insert!");
-
-	system(cmd);
-
-	/* Now put them all in groups: add 32 bytes to each record for
-	 * a group. */
-	gk.dsize = 48;
-	gk.dptr = calloc(k.dsize, 1);
-	gk.dptr[gk.dsize-1] = 1;
-
-	d.dsize = 32;
-	for (i = 0; i < groups; i++) {
-		tdb_transaction_start(tdb);
-		/* Create the "group". */
-		memcpy(gk.dptr, &i, sizeof(i));
-		ecode = tdb_store(tdb, gk, d, TDB_INSERT);
-		if (ecode != TDB_SUCCESS)
-			errx(1, "tdb insert failed: %s", tdb_errorstr(ecode));
-
-		/* Now populate it. */
-		for (j = 0; j < users; j++) {
-			/* Append to the user. */
-			memcpy(k.dptr, &j, sizeof(j));
-			if ((ecode = tdb_append(tdb, k, d)) != 0)
-				errx(1, "tdb append failed: %s",
-				     tdb_errorstr(ecode));
-
-			/* Append to the group. */
-			if ((ecode = tdb_append(tdb, gk, d)) != 0)
-				errx(1, "tdb append failed: %s",
-				     tdb_errorstr(ecode));
-		}
-		if ((ecode = tdb_transaction_commit(tdb)) != 0)
-			errx(1, "tdb commit2 failed: %s", tdb_errorstr(ecode));
-		if ((ecode = tdb_check(tdb, NULL, NULL)) != 0)
-			errx(1, "tdb_check failed after iteration %i!", i);
-		system(cmd);
-	}
-
-	return 0;
-}
diff --git a/lib/tdb2/tools/mktdb2.c b/lib/tdb2/tools/mktdb2.c
deleted file mode 100644
index 35d7a07d0b..0000000000
--- a/lib/tdb2/tools/mktdb2.c
+++ /dev/null
@@ -1,29 +0,0 @@
-#include "tdb2.h"
-#include <stdlib.h>
-#include <stdio.h>
-#include <fcntl.h>
-#include <ccan/err/err.h>
-
-int main(int argc, char *argv[])
-{
-	unsigned int i, num_recs;
-	struct tdb_context *tdb;
-
-	if (argc != 3 || (num_recs = atoi(argv[2])) == 0)
-		errx(1, "Usage: mktdb <tdbfile> <numrecords>");
-
-	tdb = tdb_open(argv[1], TDB_DEFAULT, O_CREAT|O_TRUNC|O_RDWR, 0600,NULL);
-	if (!tdb)
-		err(1, "Opening %s", argv[1]);
-
-	for (i = 0; i < num_recs; i++) {
-		TDB_DATA d;
-
-		d.dptr = (void *)&i;
-		d.dsize = sizeof(i);
-		if (tdb_store(tdb, d, d, TDB_INSERT) != 0)
-			err(1, "Failed to store record %i", i);
-	}
-	printf("Done\n");
-	return 0;
-}
diff --git a/lib/tdb2/tools/speed.c b/lib/tdb2/tools/speed.c
deleted file mode 100644
index 259d53f6c8..0000000000
--- a/lib/tdb2/tools/speed.c
+++ /dev/null
@@ -1,443 +0,0 @@
-/* Simple speed test for TDB */
-#include <ccan/err/err.h>
-#include <time.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <unistd.h>
-#include <sys/time.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdbool.h>
-#include "tdb2.h"
-
-/* Nanoseconds per operation */
-static size_t normalize(const struct timeval *start,
-			const struct timeval *stop,
-			unsigned int num)
-{
-	struct timeval diff;
-
-	timersub(stop, start, &diff);
-
-	/* Floating point is more accurate here. */
-	return (double)(diff.tv_sec * 1000000 + diff.tv_usec)
-		/ num * 1000;
-}
-
-static size_t file_size(void)
-{
-	struct stat st;
-
-	if (stat("/tmp/speed.tdb", &st) != 0)
-		return -1;
-	return st.st_size;
-}
-
-static int count_record(struct tdb_context *tdb,
-			TDB_DATA key, TDB_DATA data, void *p)
-{
-	int *total = p;
-	*total += *(int *)data.dptr;
-	return 0;
-}
-
-static void dump_and_clear_stats(struct tdb_context **tdb,
-				 int flags,
-				 union tdb_attribute *attr)
-{
-	union tdb_attribute stats;
-	enum TDB_ERROR ecode;
-
-	stats.base.attr = TDB_ATTRIBUTE_STATS;
-	stats.stats.size = sizeof(stats.stats);
-	ecode = tdb_get_attribute(*tdb, &stats);
-	if (ecode != TDB_SUCCESS)
-		errx(1, "Getting stats: %s", tdb_errorstr(ecode));
-
-	printf("allocs = %llu\n",
-	       (unsigned long long)stats.stats.allocs);
-	printf("  alloc_subhash = %llu\n",
-	       (unsigned long long)stats.stats.alloc_subhash);
-	printf("  alloc_chain = %llu\n",
-	       (unsigned long long)stats.stats.alloc_chain);
-	printf("  alloc_bucket_exact = %llu\n",
-	       (unsigned long long)stats.stats.alloc_bucket_exact);
-	printf("  alloc_bucket_max = %llu\n",
-	       (unsigned long long)stats.stats.alloc_bucket_max);
-	printf("  alloc_leftover = %llu\n",
-	       (unsigned long long)stats.stats.alloc_leftover);
-	printf("  alloc_coalesce_tried = %llu\n",
-	       (unsigned long long)stats.stats.alloc_coalesce_tried);
-	printf("    alloc_coalesce_iterate_clash = %llu\n",
-	       (unsigned long long)stats.stats.alloc_coalesce_iterate_clash);
-	printf("    alloc_coalesce_lockfail = %llu\n",
-	       (unsigned long long)stats.stats.alloc_coalesce_lockfail);
-	printf("    alloc_coalesce_race = %llu\n",
-	       (unsigned long long)stats.stats.alloc_coalesce_race);
-	printf("    alloc_coalesce_succeeded = %llu\n",
-	       (unsigned long long)stats.stats.alloc_coalesce_succeeded);
-	printf("      alloc_coalesce_num_merged = %llu\n",
-	       (unsigned long long)stats.stats.alloc_coalesce_num_merged);
-	printf("compares = %llu\n",
-	       (unsigned long long)stats.stats.compares);
-	printf("  compare_wrong_bucket = %llu\n",
-	       (unsigned long long)stats.stats.compare_wrong_bucket);
-	printf("  compare_wrong_offsetbits = %llu\n",
-	       (unsigned long long)stats.stats.compare_wrong_offsetbits);
-	printf("  compare_wrong_keylen = %llu\n",
-	       (unsigned long long)stats.stats.compare_wrong_keylen);
-	printf("  compare_wrong_rechash = %llu\n",
-	       (unsigned long long)stats.stats.compare_wrong_rechash);
-	printf("  compare_wrong_keycmp = %llu\n",
-	       (unsigned long long)stats.stats.compare_wrong_keycmp);
-	printf("transactions = %llu\n",
-	       (unsigned long long)stats.stats.transactions);
-	printf("  transaction_cancel = %llu\n",
-	       (unsigned long long)stats.stats.transaction_cancel);
-	printf("  transaction_nest = %llu\n",
-	       (unsigned long long)stats.stats.transaction_nest);
-	printf("  transaction_expand_file = %llu\n",
-	       (unsigned long long)stats.stats.transaction_expand_file);
-	printf("  transaction_read_direct = %llu\n",
-	       (unsigned long long)stats.stats.transaction_read_direct);
-	printf("    transaction_read_direct_fail = %llu\n",
-	       (unsigned long long)stats.stats.transaction_read_direct_fail);
-	printf("  transaction_write_direct = %llu\n",
-	       (unsigned long long)stats.stats.transaction_write_direct);
-	printf("    transaction_write_direct_fail = %llu\n",
-	       (unsigned long long)stats.stats.transaction_write_direct_fail);
-	printf("expands = %llu\n",
-	       (unsigned long long)stats.stats.expands);
-	printf("frees = %llu\n",
-	       (unsigned long long)stats.stats.frees);
-	printf("locks = %llu\n",
-	       (unsigned long long)stats.stats.locks);
-	printf("  lock_lowlevel = %llu\n",
-	       (unsigned long long)stats.stats.lock_lowlevel);
-	printf("  lock_nonblock = %llu\n",
-	       (unsigned long long)stats.stats.lock_nonblock);
-	printf("    lock_nonblock_fail = %llu\n",
-	       (unsigned long long)stats.stats.lock_nonblock_fail);
-
-	/* Now clear. */
-	tdb_close(*tdb);
-	*tdb = tdb_open("/tmp/speed.tdb", flags, O_RDWR, 0, attr);
-}
-
-static void tdb_log(struct tdb_context *tdb,
-		    enum tdb_log_level level,
-		    enum TDB_ERROR ecode,
-		    const char *message,
-		    void *data)
-{
-	fprintf(stderr, "tdb:%s:%s:%s\n",
-		tdb_name(tdb), tdb_errorstr(ecode), message);
-}
-
-int main(int argc, char *argv[])
-{
-	unsigned int i, j, num = 1000, stage = 0, stopat = -1;
-	int flags = TDB_DEFAULT;
-	bool transaction = false, summary = false;
-	TDB_DATA key, data;
-	struct tdb_context *tdb;
-	struct timeval start, stop;
-	union tdb_attribute seed, log;
-	bool do_stats = false;
-	enum TDB_ERROR ecode;
-
-	/* Try to keep benchmarks even. */
-	seed.base.attr = TDB_ATTRIBUTE_SEED;
-	seed.base.next = NULL;
-	seed.seed.seed = 0;
-
-	log.base.attr = TDB_ATTRIBUTE_LOG;
-	log.base.next = &seed;
-	log.log.fn = tdb_log;
-
-	if (argv[1] && strcmp(argv[1], "--internal") == 0) {
-		flags = TDB_INTERNAL;
-		argc--;
-		argv++;
-	}
-	if (argv[1] && strcmp(argv[1], "--transaction") == 0) {
-		transaction = true;
-		argc--;
-		argv++;
-	}
-	if (argv[1] && strcmp(argv[1], "--no-sync") == 0) {
-		flags |= TDB_NOSYNC;
-		argc--;
-		argv++;
-	}
-	if (argv[1] && strcmp(argv[1], "--summary") == 0) {
-		summary = true;
-		argc--;
-		argv++;
-	}
-	if (argv[1] && strcmp(argv[1], "--stats") == 0) {
-		do_stats = true;
-		argc--;
-		argv++;
-	}
-
-	tdb = tdb_open("/tmp/speed.tdb", flags, O_RDWR|O_CREAT|O_TRUNC,
-		       0600, &log);
-	if (!tdb)
-		err(1, "Opening /tmp/speed.tdb");
-
-	key.dptr = (void *)&i;
-	key.dsize = sizeof(i);
-	data = key;
-
-	if (argv[1]) {
-		num = atoi(argv[1]);
-		argv++;
-		argc--;
-	}
-
-	if (argv[1]) {
-		stopat = atoi(argv[1]);
-		argv++;
-		argc--;
-	}
-
-	/* Add 1000 records. */
-	printf("Adding %u records: ", num); fflush(stdout);
-	if (transaction && (ecode = tdb_transaction_start(tdb)))
-		errx(1, "starting transaction: %s", tdb_errorstr(ecode));
-	gettimeofday(&start, NULL);
-	for (i = 0; i < num; i++)
-		if ((ecode = tdb_store(tdb, key, data, TDB_INSERT)) != 0)
-			errx(1, "Inserting key %u in tdb: %s",
-			     i, tdb_errorstr(ecode));
-	gettimeofday(&stop, NULL);
-	if (transaction && (ecode = tdb_transaction_commit(tdb)))
-		errx(1, "committing transaction: %s", tdb_errorstr(ecode));
-	printf(" %zu ns (%zu bytes)\n",
-	       normalize(&start, &stop, num), file_size());
-
-	if (tdb_check(tdb, NULL, NULL))
-		errx(1, "tdb_check failed!");
-	if (summary) {
-		char *sumstr = NULL;
-		tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr);
-		printf("%s\n", sumstr);
-		free(sumstr);
-	}
-	if (do_stats)
-		dump_and_clear_stats(&tdb, flags, &log);
-
-	if (++stage == stopat)
-		exit(0);
-
-	/* Finding 1000 records. */
-	printf("Finding %u records: ", num); fflush(stdout);
-	if (transaction && (ecode = tdb_transaction_start(tdb)))
-		errx(1, "starting transaction: %s", tdb_errorstr(ecode));
-	gettimeofday(&start, NULL);
-	for (i = 0; i < num; i++) {
-		struct tdb_data dbuf;
-		if ((ecode = tdb_fetch(tdb, key, &dbuf)) != TDB_SUCCESS
-		    || *(int *)dbuf.dptr != i) {
-			errx(1, "Fetching key %u in tdb gave %u",
-			     i, ecode ? ecode : *(int *)dbuf.dptr);
-		}
-	}
-	gettimeofday(&stop, NULL);
-	if (transaction && (ecode = tdb_transaction_commit(tdb)))
-		errx(1, "committing transaction: %s", tdb_errorstr(ecode));
-	printf(" %zu ns (%zu bytes)\n",
-	       normalize(&start, &stop, num), file_size());
-	if (tdb_check(tdb, NULL, NULL))
-		errx(1, "tdb_check failed!");
-	if (summary) {
-		char *sumstr = NULL;
-		tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr);
-		printf("%s\n", sumstr);
-		free(sumstr);
-	}
-	if (do_stats)
-		dump_and_clear_stats(&tdb, flags, &log);
-	if (++stage == stopat)
-		exit(0);
-
-	/* Missing 1000 records. */
-	printf("Missing %u records: ", num); fflush(stdout);
-	if (transaction && (ecode = tdb_transaction_start(tdb)))
-		errx(1, "starting transaction: %s", tdb_errorstr(ecode));
-	gettimeofday(&start, NULL);
-	for (i = num; i < num*2; i++) {
-		struct tdb_data dbuf;
-		ecode = tdb_fetch(tdb, key, &dbuf);
-		if (ecode != TDB_ERR_NOEXIST)
-			errx(1, "Fetching key %u in tdb gave %s",
-			     i, tdb_errorstr(ecode));
-	}
-	gettimeofday(&stop, NULL);
-	if (transaction && (ecode = tdb_transaction_commit(tdb)))
-		errx(1, "committing transaction: %s", tdb_errorstr(ecode));
-	printf(" %zu ns (%zu bytes)\n",
-	       normalize(&start, &stop, num), file_size());
-	if (tdb_check(tdb, NULL, NULL))
-		errx(1, "tdb_check failed!");
-	if (summary) {
-		char *sumstr = NULL;
-		tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr);
-		printf("%s\n", sumstr);
-		free(sumstr);
-	}
-	if (do_stats)
-		dump_and_clear_stats(&tdb, flags, &log);
-	if (++stage == stopat)
-		exit(0);
-
-	/* Traverse 1000 records. */
-	printf("Traversing %u records: ", num); fflush(stdout);
-	if (transaction && (ecode = tdb_transaction_start(tdb)))
-		errx(1, "starting transaction: %s", tdb_errorstr(ecode));
-	i = 0;
-	gettimeofday(&start, NULL);
-	if (tdb_traverse(tdb, count_record, &i) != num)
-		errx(1, "Traverse returned wrong number of records");
-	if (i != (num - 1) * (num / 2))
-		errx(1, "Traverse tallied to %u", i);
-	gettimeofday(&stop, NULL);
-	if (transaction && (ecode = tdb_transaction_commit(tdb)))
-		errx(1, "committing transaction: %s", tdb_errorstr(ecode));
-	printf(" %zu ns (%zu bytes)\n",
-	       normalize(&start, &stop, num), file_size());
-	if (tdb_check(tdb, NULL, NULL))
-		errx(1, "tdb_check failed!");
-	if (summary) {
-		char *sumstr = NULL;
-		tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr);
-		printf("%s\n", sumstr);
-		free(sumstr);
-	}
-	if (do_stats)
-		dump_and_clear_stats(&tdb, flags, &log);
-	if (++stage == stopat)
-		exit(0);
-
-	/* Delete 1000 records (not in order). */
-	printf("Deleting %u records: ", num); fflush(stdout);
-	if (transaction && (ecode = tdb_transaction_start(tdb)))
-		errx(1, "starting transaction: %s", tdb_errorstr(ecode));
-	gettimeofday(&start, NULL);
-	for (j = 0; j < num; j++) {
-		i = (j + 100003) % num;
-		if ((ecode = tdb_delete(tdb, key)) != TDB_SUCCESS)
-			errx(1, "Deleting key %u in tdb: %s",
-			     i, tdb_errorstr(ecode));
-	}
-	gettimeofday(&stop, NULL);
-	if (transaction && (ecode = tdb_transaction_commit(tdb)))
-		errx(1, "committing transaction: %s", tdb_errorstr(ecode));
-	printf(" %zu ns (%zu bytes)\n",
-	       normalize(&start, &stop, num), file_size());
-	if (tdb_check(tdb, NULL, NULL))
-		errx(1, "tdb_check failed!");
-	if (summary) {
-		char *sumstr = NULL;
-		tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr);
-		printf("%s\n", sumstr);
-		free(sumstr);
-	}
-	if (do_stats)
-		dump_and_clear_stats(&tdb, flags, &log);
-	if (++stage == stopat)
-		exit(0);
-
-	/* Re-add 1000 records (not in order). */
-	printf("Re-adding %u records: ", num); fflush(stdout);
-	if (transaction && (ecode = tdb_transaction_start(tdb)))
-		errx(1, "starting transaction: %s", tdb_errorstr(ecode));
-	gettimeofday(&start, NULL);
-	for (j = 0; j < num; j++) {
-		i = (j + 100003) % num;
-		if ((ecode = tdb_store(tdb, key, data, TDB_INSERT)) != 0)
-			errx(1, "Inserting key %u in tdb: %s",
-			     i, tdb_errorstr(ecode));
-	}
-	gettimeofday(&stop, NULL);
-	if (transaction && (ecode = tdb_transaction_commit(tdb)))
-		errx(1, "committing transaction: %s", tdb_errorstr(ecode));
-	printf(" %zu ns (%zu bytes)\n",
-	       normalize(&start, &stop, num), file_size());
-	if (tdb_check(tdb, NULL, NULL))
-		errx(1, "tdb_check failed!");
-	if (summary) {
-		char *sumstr = NULL;
-		tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr);
-		printf("%s\n", sumstr);
-		free(sumstr);
-	}
-	if (do_stats)
-		dump_and_clear_stats(&tdb, flags, &log);
-	if (++stage == stopat)
-		exit(0);
-
-	/* Append 1000 records. */
-	if (transaction && (ecode = tdb_transaction_start(tdb)))
-		errx(1, "starting transaction: %s", tdb_errorstr(ecode));
-	printf("Appending %u records: ", num); fflush(stdout);
-	gettimeofday(&start, NULL);
-	for (i = 0; i < num; i++)
-		if ((ecode = tdb_append(tdb, key, data)) != TDB_SUCCESS)
-			errx(1, "Appending key %u in tdb: %s",
-			     i, tdb_errorstr(ecode));
-	gettimeofday(&stop, NULL);
-	if (transaction && (ecode = tdb_transaction_commit(tdb)))
-		errx(1, "committing transaction: %s", tdb_errorstr(ecode));
-	printf(" %zu ns (%zu bytes)\n",
-	       normalize(&start, &stop, num), file_size());
-	if (tdb_check(tdb, NULL, NULL))
-		errx(1, "tdb_check failed!");
-	if (summary) {
-		char *sumstr = NULL;
-		tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr);
-		printf("%s\n", sumstr);
-		free(sumstr);
-	}
-	if (++stage == stopat)
-		exit(0);
-
-	/* Churn 1000 records: not in order! */
-	if (transaction && (ecode = tdb_transaction_start(tdb)))
-		errx(1, "starting transaction: %s", tdb_errorstr(ecode));
-	printf("Churning %u records: ", num); fflush(stdout);
-	gettimeofday(&start, NULL);
-	for (j = 0; j < num; j++) {
-		i = (j + 1000019) % num;
-		if ((ecode = tdb_delete(tdb, key)) != TDB_SUCCESS)
-			errx(1, "Deleting key %u in tdb: %s",
-			     i, tdb_errorstr(ecode));
-		i += num;
-		if ((ecode = tdb_store(tdb, key, data, TDB_INSERT)) != 0)
-			errx(1, "Inserting key %u in tdb: %s",
-			     i, tdb_errorstr(ecode));
-	}
-	gettimeofday(&stop, NULL);
-	if (transaction && (ecode = tdb_transaction_commit(tdb)))
-		errx(1, "committing transaction: %s", tdb_errorstr(ecode));
-	printf(" %zu ns (%zu bytes)\n",
-	       normalize(&start, &stop, num), file_size());
-
-	if (tdb_check(tdb, NULL, NULL))
-		errx(1, "tdb_check failed!");
-	if (summary) {
-		char *sumstr = NULL;
-		tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr);
-		printf("%s\n", sumstr);
-		free(sumstr);
-	}
-	if (do_stats)
-		dump_and_clear_stats(&tdb, flags, &log);
-	if (++stage == stopat)
-		exit(0);
-
-	return 0;
-}
diff --git a/lib/tdb2/tools/tdb2backup.c b/lib/tdb2/tools/tdb2backup.c
deleted file mode 100644
index 37b301c548..0000000000
--- a/lib/tdb2/tools/tdb2backup.c
+++ /dev/null
@@ -1,340 +0,0 @@
-/*
-   Unix SMB/CIFS implementation.
-   low level tdb backup and restore utility
-   Copyright (C) Andrew Tridgell              2002
-
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 3 of the License, or
-   (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/*
-
-  This program is meant for backup/restore of tdb databases. Typical usage would be:
-     tdbbackup *.tdb
-  when Samba shuts down cleanly, which will make a backup of all the local databases
-  to *.bak files. Then on Samba startup you would use:
-     tdbbackup -v *.tdb
-  and this will check the databases for corruption and if corruption is detected then
-  the backup will be restored.
-
-  You may also like to do a backup on a regular basis while Samba is
-  running, perhaps using cron.
-
-  The reason this program is needed is to cope with power failures
-  while Samba is running. A power failure could lead to database
-  corruption and Samba will then not start correctly.
-
-  Note that many of the databases in Samba are transient and thus
-  don't need to be backed up, so you can optimise the above a little
-  by only running the backup on the critical databases.
-
- */
-
-#include "config.h"
-#include "tdb2.h"
-#include "system/filesys.h"
-
-#ifdef HAVE_GETOPT_H
-#include <getopt.h>
-#endif
-
-static int failed;
-
-static void tdb_log(struct tdb_context *tdb,
-		    enum tdb_log_level level,
-		    enum TDB_ERROR ecode,
-		    const char *message,
-		    void *data)
-{
-	fprintf(stderr, "%s:%s\n", tdb_errorstr(ecode), message);
-}
-
-static char *add_suffix(const char *name, const char *suffix)
-{
-	char *ret;
-	int len = strlen(name) + strlen(suffix) + 1;
-	ret = (char *)malloc(len);
-	if (!ret) {
-		fprintf(stderr,"Out of memory!\n");
-		exit(1);
-	}
-	snprintf(ret, len, "%s%s", name, suffix);
-	return ret;
-}
-
-static int copy_fn(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
-{
-	struct tdb_context *tdb_new = (struct tdb_context *)state;
-	enum TDB_ERROR err;
-
-	err = tdb_store(tdb_new, key, dbuf, TDB_INSERT);
-	if (err) {
-		fprintf(stderr,"Failed to insert into %s: %s\n",
-			tdb_name(tdb_new), tdb_errorstr(err));
-		failed = 1;
-		return 1;
-	}
-	return 0;
-}
-
-
-static int test_fn(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
-{
-	return 0;
-}
-
-/*
-  carefully backup a tdb, validating the contents and
-  only doing the backup if its OK
-  this function is also used for restore
-*/
-static int backup_tdb(const char *old_name, const char *new_name)
-{
-	struct tdb_context *tdb;
-	struct tdb_context *tdb_new;
-	char *tmp_name;
-	struct stat st;
-	int count1, count2;
-	enum TDB_ERROR err;
-	union tdb_attribute log_attr;
-
-	tmp_name = add_suffix(new_name, ".tmp");
-
-	/* stat the old tdb to find its permissions */
-	if (stat(old_name, &st) != 0) {
-		perror(old_name);
-		free(tmp_name);
-		return 1;
-	}
-
-	log_attr.base.attr = TDB_ATTRIBUTE_LOG;
-	log_attr.base.next = NULL;
-	log_attr.log.fn = tdb_log;
-
-	/* open the old tdb */
-	tdb = tdb_open(old_name, TDB_DEFAULT, O_RDWR, 0, &log_attr);
-	if (!tdb) {
-		printf("Failed to open %s\n", old_name);
-		free(tmp_name);
-		return 1;
-	}
-
-	unlink(tmp_name);
-	tdb_new = tdb_open(tmp_name, TDB_DEFAULT,
-			   O_RDWR|O_CREAT|O_EXCL, st.st_mode & 0777,
-			   &log_attr);
-	if (!tdb_new) {
-		perror(tmp_name);
-		free(tmp_name);
-		return 1;
-	}
-
-	err = tdb_transaction_start(tdb);
-	if (err) {
-		fprintf(stderr, "Failed to start transaction on old tdb: %s\n",
-			tdb_errorstr(err));
-		tdb_close(tdb);
-		tdb_close(tdb_new);
-		unlink(tmp_name);
-		free(tmp_name);
-		return 1;
-	}
-
-	/* lock the backup tdb so that nobody else can change it */
-	err = tdb_lockall(tdb_new);
-	if (err) {
-		fprintf(stderr, "Failed to lock backup tdb: %s\n",
-			tdb_errorstr(err));
-		tdb_close(tdb);
-		tdb_close(tdb_new);
-		unlink(tmp_name);
-		free(tmp_name);
-		return 1;
-	}
-
-	failed = 0;
-
-	/* traverse and copy */
-	count1 = tdb_traverse(tdb, copy_fn, (void *)tdb_new);
-	if (count1 < 0 || failed) {
-		fprintf(stderr,"failed to copy %s\n", old_name);
-		tdb_close(tdb);
-		tdb_close(tdb_new);
-		unlink(tmp_name);
-		free(tmp_name);
-		return 1;
-	}
-
-	/* close the old tdb */
-	tdb_close(tdb);
-
-	/* copy done, unlock the backup tdb */
-	tdb_unlockall(tdb_new);
-
-#ifdef HAVE_FDATASYNC
-	if (fdatasync(tdb_fd(tdb_new)) != 0) {
-#else
-	if (fsync(tdb_fd(tdb_new)) != 0) {
-#endif
-		/* not fatal */
-		fprintf(stderr, "failed to fsync backup file\n");
-	}
-
-	/* close the new tdb and re-open read-only */
-	tdb_close(tdb_new);
-
-	/* we don't need the hash attr any more */
-	log_attr.base.next = NULL;
-
-	tdb_new = tdb_open(tmp_name, TDB_DEFAULT, O_RDONLY, 0, &log_attr);
-	if (!tdb_new) {
-		fprintf(stderr,"failed to reopen %s\n", tmp_name);
-		unlink(tmp_name);
-		perror(tmp_name);
-		free(tmp_name);
-		return 1;
-	}
-
-	/* traverse the new tdb to confirm */
-	count2 = tdb_traverse(tdb_new, test_fn, NULL);
-	if (count2 != count1) {
-		fprintf(stderr,"failed to copy %s\n", old_name);
-		tdb_close(tdb_new);
-		unlink(tmp_name);
-		free(tmp_name);
-		return 1;
-	}
-
-	/* close the new tdb and rename it to .bak */
-	tdb_close(tdb_new);
-	if (rename(tmp_name, new_name) != 0) {
-		perror(new_name);
-		free(tmp_name);
-		return 1;
-	}
-
-	free(tmp_name);
-
-	return 0;
-}
-
-/*
-  verify a tdb and if it is corrupt then restore from *.bak
-*/
-static int verify_tdb(const char *fname, const char *bak_name)
-{
-	struct tdb_context *tdb;
-	int count = -1;
-	union tdb_attribute log_attr;
-
-	log_attr.base.attr = TDB_ATTRIBUTE_LOG;
-	log_attr.base.next = NULL;
-	log_attr.log.fn = tdb_log;
-
-	/* open the tdb */
-	tdb = tdb_open(fname, TDB_DEFAULT, O_RDONLY, 0, &log_attr);
-
-	/* traverse the tdb, then close it */
-	if (tdb) {
-		count = tdb_traverse(tdb, test_fn, NULL);
-		tdb_close(tdb);
-	}
-
-	/* count is < 0 means an error */
-	if (count < 0) {
-		printf("restoring %s\n", fname);
-		return backup_tdb(bak_name, fname);
-	}
-
-	printf("%s : %d records\n", fname, count);
-
-	return 0;
-}
-
-/*
-  see if one file is newer than another
-*/
-static int file_newer(const char *fname1, const char *fname2)
-{
-	struct stat st1, st2;
-	if (stat(fname1, &st1) != 0) {
-		return 0;
-	}
-	if (stat(fname2, &st2) != 0) {
-		return 1;
-	}
-	return (st1.st_mtime > st2.st_mtime);
-}
-
-static void usage(void)
-{
-	printf("Usage: tdb2backup [options] <fname...>\n\n");
-	printf("   -h            this help message\n");
-	printf("   -v            verify mode (restore if corrupt)\n");
-	printf("   -s suffix     set the backup suffix\n");
-	printf("   -v            verify mode (restore if corrupt)\n");
-}
-
-
- int main(int argc, char *argv[])
-{
-	int i;
-	int ret = 0;
-	int c;
-	int verify = 0;
-	const char *suffix = ".bak";
-
-	while ((c = getopt(argc, argv, "vhs:")) != -1) {
-		switch (c) {
-		case 'h':
-			usage();
-			exit(0);
-		case 'v':
-			verify = 1;
-			break;
-		case 's':
-			suffix = optarg;
-			break;
-		}
-	}
-
-	argc -= optind;
-	argv += optind;
-
-	if (argc < 1) {
-		usage();
-		exit(1);
-	}
-
-	for (i=0; i<argc; i++) {
-		const char *fname = argv[i];
-		char *bak_name;
-
-		bak_name = add_suffix(fname, suffix);
-
-		if (verify) {
-			if (verify_tdb(fname, bak_name) != 0) {
-				ret = 1;
-			}
-		} else {
-			if (file_newer(fname, bak_name) &&
-			    backup_tdb(fname, bak_name) != 0) {
-				ret = 1;
-			}
-		}
-
-		free(bak_name);
-	}
-
-	return ret;
-}
diff --git a/lib/tdb2/tools/tdb2dump.c b/lib/tdb2/tools/tdb2dump.c
deleted file mode 100644
index 40230a2643..0000000000
--- a/lib/tdb2/tools/tdb2dump.c
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
-   simple tdb2 dump util
-   Copyright (C) Andrew Tridgell              2001
-   Copyright (C) Rusty Russell                2011
-
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 3 of the License, or
-   (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-#include "config.h"
-#include "tdb2.h"
-#ifdef HAVE_LIBREPLACE
-#include <replace.h>
-#include <system/filesys.h>
-#include <system/locale.h>
-#else
-#include <ctype.h>
-#include <stdio.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include <unistd.h>
-#endif
-
-static void print_data(TDB_DATA d)
-{
-	unsigned char *p = (unsigned char *)d.dptr;
-	int len = d.dsize;
-	while (len--) {
-		if (isprint(*p) && !strchr("\"\\", *p)) {
-			fputc(*p, stdout);
-		} else {
-			printf("\\%02X", *p);
-		}
-		p++;
-	}
-}
-
-static int traverse_fn(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
-{
-	printf("{\n");
-	printf("key(%d) = \"", (int)key.dsize);
-	print_data(key);
-	printf("\"\n");
-	printf("data(%d) = \"", (int)dbuf.dsize);
-	print_data(dbuf);
-	printf("\"\n");
-	printf("}\n");
-	return 0;
-}
-
-static int dump_tdb(const char *fname, const char *keyname)
-{
-	struct tdb_context *tdb;
-	TDB_DATA key, value;
-
-	tdb = tdb_open(fname, 0, O_RDONLY, 0, NULL);
-	if (!tdb) {
-		printf("Failed to open %s\n", fname);
-		return 1;
-	}
-
-	if (!keyname) {
-		tdb_traverse(tdb, traverse_fn, NULL);
-	} else {
-		key = tdb_mkdata(keyname, strlen(keyname));
-		if (tdb_fetch(tdb, key, &value) != 0) {
-			return 1;
-		} else {
-			print_data(value);
-			free(value.dptr);
-		}
-	}
-
-	return 0;
-}
-
-static void usage( void)
-{
-	printf( "Usage: tdb2dump [options] <filename>\n\n");
-	printf( "   -h          this help message\n");
-	printf( "   -k keyname  dumps value of keyname\n");
-}
-
- int main(int argc, char *argv[])
-{
-	char *fname, *keyname=NULL;
-	int c;
-
-	if (argc < 2) {
-		printf("Usage: tdb2dump <fname>\n");
-		exit(1);
-	}
-
-	while ((c = getopt( argc, argv, "hk:")) != -1) {
-		switch (c) {
-		case 'h':
-			usage();
-			exit( 0);
-		case 'k':
-			keyname = optarg;
-			break;
-		default:
-			usage();
-			exit( 1);
-		}
-	}
-
-	fname = argv[optind];
-
-	return dump_tdb(fname, keyname);
-}
diff --git a/lib/tdb2/tools/tdb2restore.c b/lib/tdb2/tools/tdb2restore.c
deleted file mode 100644
index 93c6c8bfe5..0000000000
--- a/lib/tdb2/tools/tdb2restore.c
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
-   tdb2restore -- construct a tdb from tdbdump output.
-   Copyright (C) Volker Lendecke		2010
-   Copyright (C) Simon McVittie			2005
-
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 3 of the License, or
-   (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include "config.h"
-#include "tdb2.h"
-#include <assert.h>
-#ifdef HAVE_LIBREPLACE
-#include <replace.h>
-#include <system/filesys.h>
-#else
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#endif
-
-static int read_linehead(FILE *f)
-{
-	int i, c;
-	int num_bytes;
-	char prefix[128];
-
-	while (1) {
-		c = getc(f);
-		if (c == EOF) {
-			return -1;
-		}
-		if (c == '(') {
-			break;
-		}
-	}
-	for (i=0; i<sizeof(prefix); i++) {
-		c = getc(f);
-		if (c == EOF) {
-			return -1;
-		}
-		prefix[i] = c;
-		if (c == '"') {
-			break;
-		}
-	}
-	if (i == sizeof(prefix)) {
-		return -1;
-	}
-	prefix[i] = '\0';
-
-	if (sscanf(prefix, "%d) = ", &num_bytes) != 1) {
-		return -1;
-	}
-	return num_bytes;
-}
-
-static int read_hex(void) {
-	int c;
-	c = getchar();
-	if (c == EOF) {
-		fprintf(stderr, "Unexpected EOF in data\n");
-		return -1;
-	} else if (c == '"') {
-		fprintf(stderr, "Unexpected \\\" sequence\n");
-		return -1;
-	} else if ('0' <= c && c <= '9')  {
-		return c - '0';
-	} else if ('A' <= c && c <= 'F')  {
-		return c - 'A' + 10;
-	} else if ('a' <= c && c <= 'f')  {
-		return c - 'a' + 10;
-	} else {
-		fprintf(stderr, "Invalid hex: %c\n", c);
-		return -1;
-	}
-}
-
-static int read_data(FILE *f, struct tdb_data *d, size_t size) {
-	int c, low, high;
-	int i;
-
-	d->dptr = (unsigned char *)malloc(size);
-	if (d->dptr == NULL) {
-		return -1;
-	}
-	d->dsize = size;
-
-	for (i=0; i<size; i++) {
-		c = getc(f);
-		if (c == EOF) {
-			fprintf(stderr, "Unexpected EOF in data\n");
-			return 1;
-		} else if (c == '"') {
-			return 0;
-		} else if (c == '\\') {
-			high = read_hex();
-			if (high < 0) {
-				return -1;
-			}
-			high = high << 4;
-			assert(high == (high & 0xf0));
-			low = read_hex();
-			if (low < 0) {
-				return -1;
-			}
-			assert(low == (low & 0x0f));
-			d->dptr[i] = (low|high);
-		} else {
-			d->dptr[i] = c;
-		}
-	}
-	return 0;
-}
-
-static int swallow(FILE *f, const char *s, int *eof)
-{
-	char line[128];
-
-	if (fgets(line, sizeof(line), f) == NULL) {
-		if (eof != NULL) {
-			*eof = 1;
-		}
-		return -1;
-	}
-	if (strcmp(line, s) != 0) {
-		return -1;
-	}
-	return 0;
-}
-
-static bool read_rec(FILE *f, struct tdb_context *tdb, int *eof)
-{
-	int length;
-	struct tdb_data key, data;
-	bool ret = false;
-	enum TDB_ERROR e;
-
-	key.dptr = NULL;
-	data.dptr = NULL;
-
-	if (swallow(f, "{\n", eof) == -1) {
-		goto fail;
-	}
-	length = read_linehead(f);
-	if (length == -1) {
-		goto fail;
-	}
-	if (read_data(f, &key, length) == -1) {
-		goto fail;
-	}
-	if (swallow(f, "\"\n", NULL) == -1) {
-		goto fail;
-	}
-	length = read_linehead(f);
-	if (length == -1) {
-		goto fail;
-	}
-	if (read_data(f, &data, length) == -1) {
-		goto fail;
-	}
-	if ((swallow(f, "\"\n", NULL) == -1)
-	    || (swallow(f, "}\n", NULL) == -1)) {
-		goto fail;
-	}
-	e = tdb_store(tdb, key, data, TDB_INSERT);
-	if (e != TDB_SUCCESS) {
-		fprintf(stderr, "TDB error: %s\n", tdb_errorstr(e));
-		goto fail;
-	}
-
-	ret = true;
-fail:
-	free(key.dptr);
-	free(data.dptr);
-	return ret;
-}
-
-static int restore_tdb(const char *fname)
-{
-	struct tdb_context *tdb;
-
-	tdb = tdb_open(fname, 0, O_RDWR|O_CREAT|O_EXCL, 0666, NULL);
-	if (!tdb) {
-		perror("tdb_open");
-		fprintf(stderr, "Failed to open %s\n", fname);
-		return 1;
-	}
-
-	while (1) {
-		int eof = 0;
-		if (!read_rec(stdin, tdb, &eof)) {
-			if (eof) {
-				break;
-			}
-			return 1;
-		}
-	}
-	if (tdb_close(tdb)) {
-		fprintf(stderr, "Error closing tdb\n");
-		return 1;
-	}
-	fprintf(stderr, "EOF\n");
-	return 0;
-}
-
-int main(int argc, char *argv[])
-{
-	char *fname;
-
-	if (argc < 2) {
-		printf("Usage: %s dbname < tdbdump_output\n", argv[0]);
-		exit(1);
-	}
-
-	fname = argv[1];
-
-	return restore_tdb(fname);
-}
diff --git a/lib/tdb2/tools/tdb2tool.c b/lib/tdb2/tools/tdb2tool.c
deleted file mode 100644
index ae20971143..0000000000
--- a/lib/tdb2/tools/tdb2tool.c
+++ /dev/null
@@ -1,810 +0,0 @@
-/*
-   Unix SMB/CIFS implementation.
-   Samba database functions
-   Copyright (C) Andrew Tridgell              1999-2000
-   Copyright (C) Paul `Rusty' Russell		   2000
-   Copyright (C) Jeremy Allison			   2000
-   Copyright (C) Andrew Esh                        2001
-
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 3 of the License, or
-   (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include "config.h"
-#include "tdb2.h"
-#ifdef HAVE_LIBREPLACE
-#include <replace.h>
-#include <system/filesys.h>
-#include <system/time.h>
-#include <system/locale.h>
-#else
-#include <stdlib.h>
-#include <stdio.h>
-#include <ctype.h>
-#include <sys/time.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <string.h>
-#include <stdarg.h>
-#endif
-
-static int do_command(void);
-const char *cmdname;
-char *arg1, *arg2;
-size_t arg1len, arg2len;
-int bIterate = 0;
-char *line;
-TDB_DATA iterate_kbuf;
-char cmdline[1024];
-static int disable_mmap;
-
-enum commands {
-	CMD_CREATE_TDB,
-	CMD_OPEN_TDB,
-	CMD_TRANSACTION_START,
-	CMD_TRANSACTION_COMMIT,
-	CMD_TRANSACTION_CANCEL,
-	CMD_ERASE,
-	CMD_DUMP,
-	CMD_INSERT,
-	CMD_MOVE,
-	CMD_STORE,
-	CMD_SHOW,
-	CMD_KEYS,
-	CMD_HEXKEYS,
-	CMD_DELETE,
-#if 0
-	CMD_LIST_HASH_FREE,
-	CMD_LIST_FREE,
-#endif
-	CMD_INFO,
-	CMD_MMAP,
-	CMD_SPEED,
-	CMD_FIRST,
-	CMD_NEXT,
-	CMD_SYSTEM,
-	CMD_CHECK,
-	CMD_QUIT,
-	CMD_HELP
-};
-
-typedef struct {
-	const char *name;
-	enum commands cmd;
-} COMMAND_TABLE;
-
-COMMAND_TABLE cmd_table[] = {
-	{"create",	CMD_CREATE_TDB},
-	{"open",	CMD_OPEN_TDB},
-#if 0
-	{"transaction_start",	CMD_TRANSACTION_START},
-	{"transaction_commit",	CMD_TRANSACTION_COMMIT},
-	{"transaction_cancel",	CMD_TRANSACTION_CANCEL},
-#endif
-	{"erase",	CMD_ERASE},
-	{"dump",	CMD_DUMP},
-	{"insert",	CMD_INSERT},
-	{"move",	CMD_MOVE},
-	{"store",	CMD_STORE},
-	{"show",	CMD_SHOW},
-	{"keys",	CMD_KEYS},
-	{"hexkeys",	CMD_HEXKEYS},
-	{"delete",	CMD_DELETE},
-#if 0
-	{"list",	CMD_LIST_HASH_FREE},
-	{"free",	CMD_LIST_FREE},
-#endif
-	{"info",	CMD_INFO},
-	{"speed",	CMD_SPEED},
-	{"mmap",	CMD_MMAP},
-	{"first",	CMD_FIRST},
-	{"1",		CMD_FIRST},
-	{"next",	CMD_NEXT},
-	{"n",		CMD_NEXT},
-	{"check",	CMD_CHECK},
-	{"quit",	CMD_QUIT},
-	{"q",		CMD_QUIT},
-	{"!",		CMD_SYSTEM},
-	{NULL,		CMD_HELP}
-};
-
-struct timeval tp1,tp2;
-
-static void _start_timer(void)
-{
-	gettimeofday(&tp1,NULL);
-}
-
-static double _end_timer(void)
-{
-	gettimeofday(&tp2,NULL);
-	return((tp2.tv_sec - tp1.tv_sec) +
-	       (tp2.tv_usec - tp1.tv_usec)*1.0e-6);
-}
-
-static void tdb_log(struct tdb_context *tdb,
-		    enum tdb_log_level level,
-		    enum TDB_ERROR ecode,
-		    const char *message,
-		    void *data)
-{
-	fprintf(stderr, "tdb:%s:%s:%s\n",
-		tdb_name(tdb), tdb_errorstr(ecode), message);
-}
-
-/* a tdb tool for manipulating a tdb database */
-
-static struct tdb_context *tdb;
-
-static int print_rec(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state);
-static int print_key(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state);
-static int print_hexkey(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state);
-
-static void print_asc(const char *buf,int len)
-{
-	int i;
-
-	/* We're probably printing ASCII strings so don't try to display
-	   the trailing NULL character. */
-
-	if (buf[len - 1] == 0)
-	        len--;
-
-	for (i=0;i<len;i++)
-		printf("%c",isprint(buf[i])?buf[i]:'.');
-}
-
-static void print_data(const char *buf,int len)
-{
-	int i=0;
-	if (len<=0) return;
-	printf("[%03X] ",i);
-	for (i=0;i<len;) {
-		printf("%02X ",(int)((unsigned char)buf[i]));
-		i++;
-		if (i%8 == 0) printf(" ");
-		if (i%16 == 0) {
-			print_asc(&buf[i-16],8); printf(" ");
-			print_asc(&buf[i-8],8); printf("\n");
-			if (i<len) printf("[%03X] ",i);
-		}
-	}
-	if (i%16) {
-		int n;
-
-		n = 16 - (i%16);
-		printf(" ");
-		if (n>8) printf(" ");
-		while (n--) printf("   ");
-
-		n = i%16;
-		if (n > 8) n = 8;
-		print_asc(&buf[i-(i%16)],n); printf(" ");
-		n = (i%16) - n;
-		if (n>0) print_asc(&buf[i-n],n);
-		printf("\n");
-	}
-}
-
-static void help(void)
-{
-	printf("\n"
-"tdbtool: \n"
-"  create    dbname     : create a database\n"
-"  open      dbname     : open an existing database\n"
-"  openjh    dbname     : open an existing database (jenkins hash)\n"
-"  transaction_start    : start a transaction\n"
-"  transaction_commit   : commit a transaction\n"
-"  transaction_cancel   : cancel a transaction\n"
-"  erase                : erase the database\n"
-"  dump                 : dump the database as strings\n"
-"  keys                 : dump the database keys as strings\n"
-"  hexkeys              : dump the database keys as hex values\n"
-"  info                 : print summary info about the database\n"
-"  insert    key  data  : insert a record\n"
-"  move      key  file  : move a record to a destination tdb\n"
-"  store     key  data  : store a record (replace)\n"
-"  show      key        : show a record by key\n"
-"  delete    key        : delete a record by key\n"
-#if 0
-"  list                 : print the database hash table and freelist\n"
-"  free                 : print the database freelist\n"
-#endif
-"  check                : check the integrity of an opened database\n"
-"  speed                : perform speed tests on the database\n"
-"  ! command            : execute system command\n"
-"  1 | first            : print the first record\n"
-"  n | next             : print the next record\n"
-"  q | quit             : terminate\n"
-"  \\n                   : repeat 'next' command\n"
-"\n");
-}
-
-static void terror(enum TDB_ERROR err, const char *why)
-{
-	if (err != TDB_SUCCESS)
-		printf("%s:%s\n", tdb_errorstr(err), why);
-	else
-		printf("%s\n", why);
-}
-
-static void create_tdb(const char *tdbname)
-{
-	union tdb_attribute log_attr;
-	log_attr.base.attr = TDB_ATTRIBUTE_LOG;
-	log_attr.base.next = NULL;
-	log_attr.log.fn = tdb_log;
-
-	if (tdb) tdb_close(tdb);
-	tdb = tdb_open(tdbname, (disable_mmap?TDB_NOMMAP:0),
-		       O_RDWR | O_CREAT | O_TRUNC, 0600, &log_attr);
-	if (!tdb) {
-		printf("Could not create %s: %s\n", tdbname, strerror(errno));
-	}
-}
-
-static void open_tdb(const char *tdbname)
-{
-	union tdb_attribute log_attr;
-	log_attr.base.attr = TDB_ATTRIBUTE_LOG;
-	log_attr.base.next = NULL;
-	log_attr.log.fn = tdb_log;
-
-	if (tdb) tdb_close(tdb);
-	tdb = tdb_open(tdbname, disable_mmap?TDB_NOMMAP:0, O_RDWR, 0600,
-		       &log_attr);
-	if (!tdb) {
-		printf("Could not open %s: %s\n", tdbname, strerror(errno));
-	}
-}
-
-static void insert_tdb(char *keyname, size_t keylen, char* data, size_t datalen)
-{
-	TDB_DATA key, dbuf;
-	enum TDB_ERROR ecode;
-
-	if ((keyname == NULL) || (keylen == 0)) {
-		terror(TDB_SUCCESS, "need key");
-		return;
-	}
-
-	key.dptr = (unsigned char *)keyname;
-	key.dsize = keylen;
-	dbuf.dptr = (unsigned char *)data;
-	dbuf.dsize = datalen;
-
-	ecode = tdb_store(tdb, key, dbuf, TDB_INSERT);
-	if (ecode) {
-		terror(ecode, "insert failed");
-	}
-}
-
-static void store_tdb(char *keyname, size_t keylen, char* data, size_t datalen)
-{
-	TDB_DATA key, dbuf;
-	enum TDB_ERROR ecode;
-
-	if ((keyname == NULL) || (keylen == 0)) {
-		terror(TDB_SUCCESS, "need key");
-		return;
-	}
-
-	if ((data == NULL) || (datalen == 0)) {
-		terror(TDB_SUCCESS, "need data");
-		return;
-	}
-
-	key.dptr = (unsigned char *)keyname;
-	key.dsize = keylen;
-	dbuf.dptr = (unsigned char *)data;
-	dbuf.dsize = datalen;
-
-	printf("Storing key:\n");
-	print_rec(tdb, key, dbuf, NULL);
-
-	ecode = tdb_store(tdb, key, dbuf, TDB_REPLACE);
-	if (ecode) {
-		terror(ecode, "store failed");
-	}
-}
-
-static void show_tdb(char *keyname, size_t keylen)
-{
-	TDB_DATA key, dbuf;
-	enum TDB_ERROR ecode;
-
-	if ((keyname == NULL) || (keylen == 0)) {
-		terror(TDB_SUCCESS, "need key");
-		return;
-	}
-
-	key.dptr = (unsigned char *)keyname;
-	key.dsize = keylen;
-
-	ecode = tdb_fetch(tdb, key, &dbuf);
-	if (ecode) {
-		terror(ecode, "fetch failed");
-		return;
-	}
-
-	print_rec(tdb, key, dbuf, NULL);
-
-	free( dbuf.dptr );
-}
-
-static void delete_tdb(char *keyname, size_t keylen)
-{
-	TDB_DATA key;
-	enum TDB_ERROR ecode;
-
-	if ((keyname == NULL) || (keylen == 0)) {
-		terror(TDB_SUCCESS, "need key");
-		return;
-	}
-
-	key.dptr = (unsigned char *)keyname;
-	key.dsize = keylen;
-
-	ecode = tdb_delete(tdb, key);
-	if (ecode) {
-		terror(ecode, "delete failed");
-	}
-}
-
-static void move_rec(char *keyname, size_t keylen, char* tdbname)
-{
-	TDB_DATA key, dbuf;
-	struct tdb_context *dst_tdb;
-	enum TDB_ERROR ecode;
-
-	if ((keyname == NULL) || (keylen == 0)) {
-		terror(TDB_SUCCESS, "need key");
-		return;
-	}
-
-	if ( !tdbname ) {
-		terror(TDB_SUCCESS, "need destination tdb name");
-		return;
-	}
-
-	key.dptr = (unsigned char *)keyname;
-	key.dsize = keylen;
-
-	ecode = tdb_fetch(tdb, key, &dbuf);
-	if (ecode) {
-		terror(ecode, "fetch failed");
-		return;
-	}
-
-	print_rec(tdb, key, dbuf, NULL);
-
-	dst_tdb = tdb_open(tdbname, 0, O_RDWR, 0600, NULL);
-	if ( !dst_tdb ) {
-		terror(TDB_SUCCESS, "unable to open destination tdb");
-		return;
-	}
-
-	ecode = tdb_store( dst_tdb, key, dbuf, TDB_REPLACE);
-	if (ecode)
-		terror(ecode, "failed to move record");
-	else
-		printf("record moved\n");
-
-	tdb_close( dst_tdb );
-}
-
-static int print_rec(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
-{
-	printf("\nkey %d bytes\n", (int)key.dsize);
-	print_asc((const char *)key.dptr, key.dsize);
-	printf("\ndata %d bytes\n", (int)dbuf.dsize);
-	print_data((const char *)dbuf.dptr, dbuf.dsize);
-	return 0;
-}
-
-static int print_key(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
-{
-	printf("key %d bytes: ", (int)key.dsize);
-	print_asc((const char *)key.dptr, key.dsize);
-	printf("\n");
-	return 0;
-}
-
-static int print_hexkey(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
-{
-	printf("key %d bytes\n", (int)key.dsize);
-	print_data((const char *)key.dptr, key.dsize);
-	printf("\n");
-	return 0;
-}
-
-static int total_bytes;
-
-static int traverse_fn(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
-{
-	total_bytes += dbuf.dsize;
-	return 0;
-}
-
-static void info_tdb(void)
-{
-	enum TDB_ERROR ecode;
-	char *summary;
-
-	ecode = tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &summary);
-
-	if (ecode) {
-		terror(ecode, "Getting summary");
-	} else {
-		printf("%s", summary);
-		free(summary);
-	}
-}
-
-static void speed_tdb(const char *tlimit)
-{
-	unsigned timelimit = tlimit?atoi(tlimit):0;
-	double t;
-	int ops;
-	if (timelimit == 0) timelimit = 5;
-
-	ops = 0;
-	printf("Testing store speed for %u seconds\n", timelimit);
-	_start_timer();
-	do {
-		long int r = random();
-		TDB_DATA key, dbuf;
-		key = tdb_mkdata("store test", strlen("store test"));
-		dbuf.dptr = (unsigned char *)&r;
-		dbuf.dsize = sizeof(r);
-		tdb_store(tdb, key, dbuf, TDB_REPLACE);
-		t = _end_timer();
-		ops++;
-	} while (t < timelimit);
-	printf("%10.3f ops/sec\n", ops/t);
-
-	ops = 0;
-	printf("Testing fetch speed for %u seconds\n", timelimit);
-	_start_timer();
-	do {
-		long int r = random();
-		TDB_DATA key, dbuf;
-		key = tdb_mkdata("store test", strlen("store test"));
-		dbuf.dptr = (unsigned char *)&r;
-		dbuf.dsize = sizeof(r);
-		tdb_fetch(tdb, key, &dbuf);
-		t = _end_timer();
-		ops++;
-	} while (t < timelimit);
-	printf("%10.3f ops/sec\n", ops/t);
-
-	ops = 0;
-	printf("Testing transaction speed for %u seconds\n", timelimit);
-	_start_timer();
-	do {
-		long int r = random();
-		TDB_DATA key, dbuf;
-		key = tdb_mkdata("transaction test", strlen("transaction test"));
-		dbuf.dptr = (unsigned char *)&r;
-		dbuf.dsize = sizeof(r);
-		tdb_transaction_start(tdb);
-		tdb_store(tdb, key, dbuf, TDB_REPLACE);
-		tdb_transaction_commit(tdb);
-		t = _end_timer();
-		ops++;
-	} while (t < timelimit);
-	printf("%10.3f ops/sec\n", ops/t);
-
-	ops = 0;
-	printf("Testing traverse speed for %u seconds\n", timelimit);
-	_start_timer();
-	do {
-		tdb_traverse(tdb, traverse_fn, NULL);
-		t = _end_timer();
-		ops++;
-	} while (t < timelimit);
-	printf("%10.3f ops/sec\n", ops/t);
-}
-
-static void toggle_mmap(void)
-{
-	disable_mmap = !disable_mmap;
-	if (disable_mmap) {
-		printf("mmap is disabled\n");
-	} else {
-		printf("mmap is enabled\n");
-	}
-}
-
-static char *tdb_getline(const char *prompt)
-{
-	static char thisline[1024];
-	char *p;
-	fputs(prompt, stdout);
-	thisline[0] = 0;
-	p = fgets(thisline, sizeof(thisline)-1, stdin);
-	if (p) p = strchr(p, '\n');
-	if (p) *p = 0;
-	return p?thisline:NULL;
-}
-
-static int do_delete_fn(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf,
-                     void *state)
-{
-    return tdb_delete(the_tdb, key);
-}
-
-static void first_record(struct tdb_context *the_tdb, TDB_DATA *pkey)
-{
-	TDB_DATA dbuf;
-	enum TDB_ERROR ecode;
-	ecode = tdb_firstkey(the_tdb, pkey);
-	if (!ecode)
-		ecode = tdb_fetch(the_tdb, *pkey, &dbuf);
-	if (ecode) terror(ecode, "fetch failed");
-	else {
-		print_rec(the_tdb, *pkey, dbuf, NULL);
-	}
-}
-
-static void next_record(struct tdb_context *the_tdb, TDB_DATA *pkey)
-{
-	TDB_DATA dbuf;
-	enum TDB_ERROR ecode;
-	ecode = tdb_nextkey(the_tdb, pkey);
-
-	if (!ecode)
-		ecode = tdb_fetch(the_tdb, *pkey, &dbuf);
-	if (ecode)
-		terror(ecode, "fetch failed");
-	else
-		print_rec(the_tdb, *pkey, dbuf, NULL);
-}
-
-static void check_db(struct tdb_context *the_tdb)
-{
-	if (!the_tdb) {
-		printf("Error: No database opened!\n");
-	} else {
-		if (tdb_check(the_tdb, NULL, NULL) != 0)
-			printf("Integrity check for the opened database failed.\n");
-		else
-			printf("Database integrity is OK.\n");
-	}
-}
-
-static int do_command(void)
-{
-	COMMAND_TABLE *ctp = cmd_table;
-	enum commands mycmd = CMD_HELP;
-	int cmd_len;
-
-	if (cmdname && strlen(cmdname) == 0) {
-		mycmd = CMD_NEXT;
-	} else {
-		while (ctp->name) {
-			cmd_len = strlen(ctp->name);
-			if (strncmp(ctp->name,cmdname,cmd_len) == 0) {
-				mycmd = ctp->cmd;
-				break;
-			}
-			ctp++;
-		}
-	}
-
-	switch (mycmd) {
-	case CMD_CREATE_TDB:
-		bIterate = 0;
-		create_tdb(arg1);
-		return 0;
-	case CMD_OPEN_TDB:
-		bIterate = 0;
-		open_tdb(arg1);
-		return 0;
-	case CMD_SYSTEM:
-		/* Shell command */
-		if (system(arg1) == -1) {
-			terror(TDB_SUCCESS, "system() call failed\n");
-		}
-		return 0;
-	case CMD_QUIT:
-		return 1;
-	default:
-		/* all the rest require a open database */
-		if (!tdb) {
-			bIterate = 0;
-			terror(TDB_SUCCESS, "database not open");
-			help();
-			return 0;
-		}
-		switch (mycmd) {
-		case CMD_TRANSACTION_START:
-			bIterate = 0;
-			tdb_transaction_start(tdb);
-			return 0;
-		case CMD_TRANSACTION_COMMIT:
-			bIterate = 0;
-			tdb_transaction_commit(tdb);
-			return 0;
-		case CMD_TRANSACTION_CANCEL:
-			bIterate = 0;
-			tdb_transaction_cancel(tdb);
-			return 0;
-		case CMD_ERASE:
-			bIterate = 0;
-			tdb_traverse(tdb, do_delete_fn, NULL);
-			return 0;
-		case CMD_DUMP:
-			bIterate = 0;
-			tdb_traverse(tdb, print_rec, NULL);
-			return 0;
-		case CMD_INSERT:
-			bIterate = 0;
-			insert_tdb(arg1, arg1len,arg2,arg2len);
-			return 0;
-		case CMD_MOVE:
-			bIterate = 0;
-			move_rec(arg1,arg1len,arg2);
-			return 0;
-		case CMD_STORE:
-			bIterate = 0;
-			store_tdb(arg1,arg1len,arg2,arg2len);
-			return 0;
-		case CMD_SHOW:
-			bIterate = 0;
-			show_tdb(arg1, arg1len);
-			return 0;
-		case CMD_KEYS:
-			tdb_traverse(tdb, print_key, NULL);
-			return 0;
-		case CMD_HEXKEYS:
-			tdb_traverse(tdb, print_hexkey, NULL);
-			return 0;
-		case CMD_DELETE:
-			bIterate = 0;
-			delete_tdb(arg1,arg1len);
-			return 0;
-#if 0
-		case CMD_LIST_HASH_FREE:
-			tdb_dump_all(tdb);
-			return 0;
-		case CMD_LIST_FREE:
-			tdb_printfreelist(tdb);
-			return 0;
-#endif
-		case CMD_INFO:
-			info_tdb();
-			return 0;
-		case CMD_SPEED:
-			speed_tdb(arg1);
-			return 0;
-		case CMD_MMAP:
-			toggle_mmap();
-			return 0;
-		case CMD_FIRST:
-			bIterate = 1;
-			first_record(tdb, &iterate_kbuf);
-			return 0;
-		case CMD_NEXT:
-			if (bIterate)
-				next_record(tdb, &iterate_kbuf);
-			return 0;
-		case CMD_CHECK:
-			check_db(tdb);
-			return 0;
-		case CMD_HELP:
-			help();
-			return 0;
-		case CMD_CREATE_TDB:
-		case CMD_OPEN_TDB:
-		case CMD_SYSTEM:
-		case CMD_QUIT:
-			/*
-			 * unhandled commands.  cases included here to avoid compiler
-			 * warnings.
-			 */
-			return 0;
-		}
-	}
-
-	return 0;
-}
-
-static char *convert_string(char *instring, size_t *sizep)
-{
-	size_t length = 0;
-	char *outp, *inp;
-	char temp[3];
-
-	outp = inp = instring;
-
-	while (*inp) {
-		if (*inp == '\\') {
-			inp++;
-			if (*inp && strchr("0123456789abcdefABCDEF",(int)*inp)) {
-				temp[0] = *inp++;
-				temp[1] = '\0';
-				if (*inp && strchr("0123456789abcdefABCDEF",(int)*inp)) {
-					temp[1] = *inp++;
-					temp[2] = '\0';
-				}
-				*outp++ = (char)strtol((const char *)temp,NULL,16);
-			} else {
-				*outp++ = *inp++;
-			}
-		} else {
-			*outp++ = *inp++;
-		}
-		length++;
-	}
-	*sizep = length;
-	return instring;
-}
-
-int main(int argc, char *argv[])
-{
-	cmdname = "";
-	arg1 = NULL;
-	arg1len = 0;
-	arg2 = NULL;
-	arg2len = 0;
-
-	if (argv[1]) {
-		cmdname = "open";
-		arg1 = argv[1];
-		do_command();
-		cmdname =  "";
-		arg1 = NULL;
-	}
-
-	switch (argc) {
-	case 1:
-	case 2:
-		/* Interactive mode */
-		while ((cmdname = tdb_getline("tdb> "))) {
-			arg2 = arg1 = NULL;
-			if ((arg1 = strchr((const char *)cmdname,' ')) != NULL) {
-				arg1++;
-				arg2 = arg1;
-				while (*arg2) {
-					if (*arg2 == ' ') {
-						*arg2++ = '\0';
-						break;
-					}
-					if ((*arg2++ == '\\') && (*arg2 == ' ')) {
-						arg2++;
-					}
-				}
-			}
-			if (arg1) arg1 = convert_string(arg1,&arg1len);
-			if (arg2) arg2 = convert_string(arg2,&arg2len);
-			if (do_command()) break;
-		}
-		break;
-	case 5:
-		arg2 = convert_string(argv[4],&arg2len);
-	case 4:
-		arg1 = convert_string(argv[3],&arg1len);
-	case 3:
-		cmdname = argv[2];
-	default:
-		do_command();
-		break;
-	}
-
-	if (tdb) tdb_close(tdb);
-
-	return 0;
-}
diff --git a/lib/tdb2/tools/tdb2torture.c b/lib/tdb2/tools/tdb2torture.c
deleted file mode 100644
index 73e2e29874..0000000000
--- a/lib/tdb2/tools/tdb2torture.c
+++ /dev/null
@@ -1,529 +0,0 @@
-/* this tests tdb by doing lots of ops from several simultaneous
-   writers - that stresses the locking code.
-*/
-
-#include "config.h"
-#include "tdb2.h"
-#include <ccan/err/err.h>
-#ifdef HAVE_LIBREPLACE
-#include <replace.h>
-#else
-#include <stdlib.h>
-#include <getopt.h>
-#include <stdarg.h>
-#include <stdio.h>
-#include <string.h>
-#include <errno.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <fcntl.h>
-#include <time.h>
-#include <sys/wait.h>
-#endif
-
-//#define REOPEN_PROB 30
-#define DELETE_PROB 8
-#define STORE_PROB 4
-#define APPEND_PROB 6
-#define TRANSACTION_PROB 10
-#define TRANSACTION_PREPARE_PROB 2
-#define LOCKSTORE_PROB 5
-#define TRAVERSE_PROB 20
-#define TRAVERSE_MOD_PROB 100
-#define TRAVERSE_ABORT_PROB 500
-#define CULL_PROB 100
-#define KEYLEN 3
-#define DATALEN 100
-
-static struct tdb_context *db;
-static int in_transaction;
-static int in_traverse;
-static int error_count;
-#if TRANSACTION_PROB
-static int always_transaction = 0;
-#endif
-static int loopnum;
-static int count_pipe;
-static union tdb_attribute log_attr;
-static union tdb_attribute seed_attr;
-
-static void tdb_log(struct tdb_context *tdb,
-		    enum tdb_log_level level,
-		    enum TDB_ERROR ecode,
-		    const char *message,
-		    void *data)
-{
-	printf("tdb:%s:%s:%s\n",
-	       tdb_name(tdb), tdb_errorstr(ecode), message);
-	fflush(stdout);
-#if 0
-	{
-		char str[200];
-		signal(SIGUSR1, SIG_IGN);
-		sprintf(str,"xterm -e gdb /proc/%d/exe %d", getpid(), getpid());
-		system(str);
-	}
-#endif
-}
-
-#include "../private.h"
-
-static void segv_handler(int sig, siginfo_t *info, void *p)
-{
-	char string[100];
-
-	sprintf(string, "%u: death at %p (map_ptr %p, map_size %zu)\n",
-		getpid(), info->si_addr, db->file->map_ptr,
-		(size_t)db->file->map_size);
-	if (write(2, string, strlen(string)) > 0)
-		sleep(60);
-	_exit(11);
-}
-
-static void fatal(struct tdb_context *tdb, const char *why)
-{
-	fprintf(stderr, "%u:%s:%s\n", getpid(), why,
-		tdb ? tdb_errorstr(tdb_error(tdb)) : "(no tdb)");
-	error_count++;
-}
-
-static char *randbuf(int len)
-{
-	char *buf;
-	int i;
-	buf = (char *)malloc(len+1);
-
-	for (i=0;i<len;i++) {
-		buf[i] = 'a' + (rand() % 26);
-	}
-	buf[i] = 0;
-	return buf;
-}
-
-static void addrec_db(void);
-static int modify_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf,
-			   void *state)
-{
-#if CULL_PROB
-	if (random() % CULL_PROB == 0) {
-		tdb_delete(tdb, key);
-	}
-#endif
-
-#if TRAVERSE_MOD_PROB
-	if (random() % TRAVERSE_MOD_PROB == 0) {
-		addrec_db();
-	}
-#endif
-
-#if TRAVERSE_ABORT_PROB
-	if (random() % TRAVERSE_ABORT_PROB == 0)
-		return 1;
-#endif
-
-	return 0;
-}
-
-static void addrec_db(void)
-{
-	int klen, dlen;
-	char *k, *d;
-	TDB_DATA key, data;
-
-	klen = 1 + (rand() % KEYLEN);
-	dlen = 1 + (rand() % DATALEN);
-
-	k = randbuf(klen);
-	d = randbuf(dlen);
-
-	key.dptr = (unsigned char *)k;
-	key.dsize = klen+1;
-
-	data.dptr = (unsigned char *)d;
-	data.dsize = dlen+1;
-
-#if REOPEN_PROB
-	if (in_traverse == 0 && in_transaction == 0 && random() % REOPEN_PROB == 0) {
-		tdb_reopen_all(0);
-		goto next;
-	}
-#endif
-
-#if TRANSACTION_PROB
-	if (in_traverse == 0 && in_transaction == 0 && (always_transaction || random() % TRANSACTION_PROB == 0)) {
-		if (tdb_transaction_start(db) != 0) {
-			fatal(db, "tdb_transaction_start failed");
-		}
-		in_transaction++;
-		goto next;
-	}
-	if (in_traverse == 0 && in_transaction && random() % TRANSACTION_PROB == 0) {
-		if (random() % TRANSACTION_PREPARE_PROB == 0) {
-			if (tdb_transaction_prepare_commit(db) != 0) {
-				fatal(db, "tdb_transaction_prepare_commit failed");
-			}
-		}
-		if (tdb_transaction_commit(db) != 0) {
-			fatal(db, "tdb_transaction_commit failed");
-		}
-		in_transaction--;
-		goto next;
-	}
-
-	if (in_traverse == 0 && in_transaction && random() % TRANSACTION_PROB == 0) {
-		tdb_transaction_cancel(db);
-		in_transaction--;
-		goto next;
-	}
-#endif
-
-#if DELETE_PROB
-	if (random() % DELETE_PROB == 0) {
-		tdb_delete(db, key);
-		goto next;
-	}
-#endif
-
-#if STORE_PROB
-	if (random() % STORE_PROB == 0) {
-		if (tdb_store(db, key, data, TDB_REPLACE) != 0) {
-			fatal(db, "tdb_store failed");
-		}
-		goto next;
-	}
-#endif
-
-#if APPEND_PROB
-	if (random() % APPEND_PROB == 0) {
-		if (tdb_append(db, key, data) != 0) {
-			fatal(db, "tdb_append failed");
-		}
-		goto next;
-	}
-#endif
-
-#if LOCKSTORE_PROB
-	if (random() % LOCKSTORE_PROB == 0) {
-		tdb_chainlock(db, key);
-		if (tdb_fetch(db, key, &data) != TDB_SUCCESS) {
-			data.dsize = 0;
-			data.dptr = NULL;
-		}
-		if (tdb_store(db, key, data, TDB_REPLACE) != 0) {
-			fatal(db, "tdb_store failed");
-		}
-		if (data.dptr) free(data.dptr);
-		tdb_chainunlock(db, key);
-		goto next;
-	}
-#endif
-
-#if TRAVERSE_PROB
-	/* FIXME: recursive traverses break transactions? */
-	if (in_traverse == 0 && random() % TRAVERSE_PROB == 0) {
-		in_traverse++;
-		tdb_traverse(db, modify_traverse, NULL);
-		in_traverse--;
-		goto next;
-	}
-#endif
-
-	if (tdb_fetch(db, key, &data) == TDB_SUCCESS)
-		free(data.dptr);
-
-next:
-	free(k);
-	free(d);
-}
-
-static int traverse_fn(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf,
-                       void *state)
-{
-	tdb_delete(tdb, key);
-	return 0;
-}
-
-static void usage(void)
-{
-	printf("Usage: tdb2torture"
-#if TRANSACTION_PROB
-	       " [-t]"
-#endif
-	       " [-k] [-n NUM_PROCS] [-l NUM_LOOPS] [-s SEED] [-S]\n");
-	exit(0);
-}
-
-static void send_count_and_suicide(int sig)
-{
-	/* This ensures our successor can continue where we left off. */
-	if (write(count_pipe, &loopnum, sizeof(loopnum)) != sizeof(loopnum))
-		exit(2);
-	/* This gives a unique signature. */
-	kill(getpid(), SIGUSR2);
-}
-
-static int run_child(const char *filename, int i, int seed, unsigned num_loops,
-		     unsigned start, int tdb_flags)
-{
-	struct sigaction act = { .sa_sigaction = segv_handler,
-				 .sa_flags = SA_SIGINFO };
-	sigaction(11, &act, NULL);
-
-	db = tdb_open(filename, tdb_flags, O_RDWR | O_CREAT, 0600,
-		      &log_attr);
-	if (!db) {
-		fatal(NULL, "db open failed");
-	}
-
-#if 0
-	if (i == 0) {
-		printf("pid %i\n", getpid());
-		sleep(9);
-	} else
-		sleep(10);
-#endif
-
-	srand(seed + i);
-	srandom(seed + i);
-
-	/* Set global, then we're ready to handle being killed. */
-	loopnum = start;
-	signal(SIGUSR1, send_count_and_suicide);
-
-	for (;loopnum<num_loops && error_count == 0;loopnum++) {
-		addrec_db();
-	}
-
-	if (error_count == 0) {
-		tdb_traverse(db, NULL, NULL);
-#if TRANSACTION_PROB
-		if (always_transaction) {
-			while (in_transaction) {
-				tdb_transaction_cancel(db);
-				in_transaction--;
-			}
-			if (tdb_transaction_start(db) != 0)
-				fatal(db, "tdb_transaction_start failed");
-		}
-#endif
-		tdb_traverse(db, traverse_fn, NULL);
-		tdb_traverse(db, traverse_fn, NULL);
-
-#if TRANSACTION_PROB
-		if (always_transaction) {
-			if (tdb_transaction_commit(db) != 0)
-				fatal(db, "tdb_transaction_commit failed");
-		}
-#endif
-	}
-
-	tdb_close(db);
-
-	return (error_count < 100 ? error_count : 100);
-}
-
-static char *test_path(const char *filename)
-{
-	const char *prefix = getenv("TEST_DATA_PREFIX");
-
-	if (prefix) {
-		char *path = NULL;
-		int ret;
-
-		ret = asprintf(&path, "%s/%s", prefix, filename);
-		if (ret == -1) {
-			return NULL;
-		}
-		return path;
-	}
-
-	return strdup(filename);
-}
-
-int main(int argc, char * const *argv)
-{
-	int i, seed = -1;
-	int num_loops = 5000;
-	int num_procs = 3;
-	int c, pfds[2];
-	extern char *optarg;
-	pid_t *pids;
-	int kill_random = 0;
-	int *done;
-	int tdb_flags = TDB_DEFAULT;
-	char *test_tdb;
-
-	log_attr.base.attr = TDB_ATTRIBUTE_LOG;
-	log_attr.base.next = &seed_attr;
-	log_attr.log.fn = tdb_log;
-	seed_attr.base.attr = TDB_ATTRIBUTE_SEED;
-	seed_attr.base.next = NULL;
-
-	while ((c = getopt(argc, argv, "n:l:s:thkS")) != -1) {
-		switch (c) {
-		case 'n':
-			num_procs = strtol(optarg, NULL, 0);
-			break;
-		case 'l':
-			num_loops = strtol(optarg, NULL, 0);
-			break;
-		case 's':
-			seed = strtol(optarg, NULL, 0);
-			break;
-		case 'S':
-			tdb_flags = TDB_NOSYNC;
-			break;
-		case 't':
-#if TRANSACTION_PROB
-			always_transaction = 1;
-#else
-			fprintf(stderr, "Transactions not supported\n");
-			usage();
-#endif
-			break;
-		case 'k':
-			kill_random = 1;
-			break;
-		default:
-			usage();
-		}
-	}
-
-	test_tdb = test_path("torture.tdb2");
-
-	unlink(test_tdb);
-
-	if (seed == -1) {
-		seed = (getpid() + time(NULL)) & 0x7FFFFFFF;
-	}
-	seed_attr.seed.seed = (((uint64_t)seed) << 32) | seed;
-
-	if (num_procs == 1 && !kill_random) {
-		/* Don't fork for this case, makes debugging easier. */
-		error_count = run_child(test_tdb, 0, seed, num_loops, 0,
-					tdb_flags);
-		goto done;
-	}
-
-	pids = (pid_t *)calloc(sizeof(pid_t), num_procs);
-	done = (int *)calloc(sizeof(int), num_procs);
-
-	if (pipe(pfds) != 0) {
-		perror("Creating pipe");
-		exit(1);
-	}
-	count_pipe = pfds[1];
-
-	for (i=0;i<num_procs;i++) {
-		if ((pids[i]=fork()) == 0) {
-			close(pfds[0]);
-			if (i == 0) {
-				printf("testing with %d processes, %d loops, seed=%d%s\n",
-				       num_procs, num_loops, seed,
-#if TRANSACTION_PROB
-				       always_transaction ? " (all within transactions)" : ""
-#else
-				       ""
-#endif
-					);
-			}
-			exit(run_child(test_tdb, i, seed, num_loops, 0,
-				       tdb_flags));
-		}
-	}
-
-	while (num_procs) {
-		int status, j;
-		pid_t pid;
-
-		if (error_count != 0) {
-			/* try and stop the test on any failure */
-			for (j=0;j<num_procs;j++) {
-				if (pids[j] != 0) {
-					kill(pids[j], SIGTERM);
-				}
-			}
-		}
-
-		pid = waitpid(-1, &status, kill_random ? WNOHANG : 0);
-		if (pid == 0) {
-			struct timespec ts;
-
-			/* Sleep for 1/10 second. */
-			ts.tv_sec = 0;
-			ts.tv_nsec = 100000000;
-			nanosleep(&ts, NULL);
-
-			/* Kill someone. */
-			kill(pids[random() % num_procs], SIGUSR1);
-			continue;
-		}
-
-		if (pid == -1) {
-			perror("failed to wait for child\n");
-			exit(1);
-		}
-
-		for (j=0;j<num_procs;j++) {
-			if (pids[j] == pid) break;
-		}
-		if (j == num_procs) {
-			printf("unknown child %d exited!?\n", (int)pid);
-			exit(1);
-		}
-		if (WIFSIGNALED(status)) {
-			if (WTERMSIG(status) == SIGUSR2
-			    || WTERMSIG(status) == SIGUSR1) {
-				/* SIGUSR2 means they wrote to pipe. */
-				if (WTERMSIG(status) == SIGUSR2) {
-					if (read(pfds[0], &done[j],
-						 sizeof(done[j]))
-					    != sizeof(done[j]))
-						err(1,
-						    "Short read from child?");
-				}
-				pids[j] = fork();
-				if (pids[j] == 0)
-					exit(run_child(test_tdb, j, seed,
-						       num_loops, done[j],
-						       tdb_flags));
-				printf("Restarting child %i for %u-%u\n",
-				       j, done[j], num_loops);
-				continue;
-			}
-			printf("child %d exited with signal %d\n",
-			       (int)pid, WTERMSIG(status));
-			error_count++;
-		} else {
-			if (WEXITSTATUS(status) != 0) {
-				printf("child %d exited with status %d\n",
-				       (int)pid, WEXITSTATUS(status));
-				error_count++;
-			}
-		}
-		memmove(&pids[j], &pids[j+1],
-			(num_procs - j - 1)*sizeof(pids[0]));
-		num_procs--;
-	}
-
-	free(pids);
-
-done:
-	if (error_count == 0) {
-		db = tdb_open(test_tdb, TDB_DEFAULT, O_RDWR | O_CREAT,
-			      0600, &log_attr);
-		if (!db) {
-			fatal(db, "db open failed");
-			exit(1);
-		}
-		if (tdb_check(db, NULL, NULL) != 0) {
-			fatal(db, "db check failed");
-			exit(1);
-		}
-		tdb_close(db);
-		printf("OK\n");
-	}
-
-	free(test_tdb);
-	return error_count;
-}
diff --git a/lib/tdb2/transaction.c b/lib/tdb2/transaction.c
deleted file mode 100644
index 2b714714dc..0000000000
--- a/lib/tdb2/transaction.c
+++ /dev/null
@@ -1,1322 +0,0 @@
- /*
-   Unix SMB/CIFS implementation.
-
-   trivial database library
-
-   Copyright (C) Andrew Tridgell              2005
-   Copyright (C) Rusty Russell                2010
-
-     ** NOTE! The following LGPL license applies to the tdb
-     ** library. This does NOT imply that all of Samba is released
-     ** under the LGPL
-
-   This library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 3 of the License, or (at your option) any later version.
-
-   This library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include "private.h"
-#define SAFE_FREE(x) do { if ((x) != NULL) {free((void *)x); (x)=NULL;} } while(0)
-
-/*
-  transaction design:
-
-  - only allow a single transaction at a time per database. This makes
-    using the transaction API simpler, as otherwise the caller would
-    have to cope with temporary failures in transactions that conflict
-    with other current transactions
-
-  - keep the transaction recovery information in the same file as the
-    database, using a special 'transaction recovery' record pointed at
-    by the header. This removes the need for extra journal files as
-    used by some other databases
-
-  - dynamically allocated the transaction recover record, re-using it
-    for subsequent transactions. If a larger record is needed then
-    tdb_free() the old record to place it on the normal tdb freelist
-    before allocating the new record
-
-  - during transactions, keep a linked list of writes all that have
-    been performed by intercepting all tdb_write() calls. The hooked
-    transaction versions of tdb_read() and tdb_write() check this
-    linked list and try to use the elements of the list in preference
-    to the real database.
-
-  - don't allow any locks to be held when a transaction starts,
-    otherwise we can end up with deadlock (plus lack of lock nesting
-    in POSIX locks would mean the lock is lost)
-
-  - if the caller gains a lock during the transaction but doesn't
-    release it then fail the commit
-
-  - allow for nested calls to tdb_transaction_start(), re-using the
-    existing transaction record. If the inner transaction is canceled
-    then a subsequent commit will fail
-
-  - keep a mirrored copy of the tdb hash chain heads to allow for the
-    fast hash heads scan on traverse, updating the mirrored copy in
-    the transaction version of tdb_write
-
-  - allow callers to mix transaction and non-transaction use of tdb,
-    although once a transaction is started then an exclusive lock is
-    gained until the transaction is committed or canceled
-
-  - the commit stategy involves first saving away all modified data
-    into a linearised buffer in the transaction recovery area, then
-    marking the transaction recovery area with a magic value to
-    indicate a valid recovery record. In total 4 fsync/msync calls are
-    needed per commit to prevent race conditions. It might be possible
-    to reduce this to 3 or even 2 with some more work.
-
-  - check for a valid recovery record on open of the tdb, while the
-    open lock is held. Automatically recover from the transaction
-    recovery area if needed, then continue with the open as
-    usual. This allows for smooth crash recovery with no administrator
-    intervention.
-
-  - if TDB_NOSYNC is passed to flags in tdb_open then transactions are
-    still available, but no transaction recovery area is used and no
-    fsync/msync calls are made.
-*/
-
-/*
-  hold the context of any current transaction
-*/
-struct tdb_transaction {
-	/* the original io methods - used to do IOs to the real db */
-	const struct tdb_methods *io_methods;
-
-	/* the list of transaction blocks. When a block is first
-	   written to, it gets created in this list */
-	uint8_t **blocks;
-	size_t num_blocks;
-	size_t last_block_size; /* number of valid bytes in the last block */
-
-	/* non-zero when an internal transaction error has
-	   occurred. All write operations will then fail until the
-	   transaction is ended */
-	int transaction_error;
-
-	/* when inside a transaction we need to keep track of any
-	   nested tdb_transaction_start() calls, as these are allowed,
-	   but don't create a new transaction */
-	unsigned int nesting;
-
-	/* set when a prepare has already occurred */
-	bool prepared;
-	tdb_off_t magic_offset;
-
-	/* old file size before transaction */
-	tdb_len_t old_map_size;
-};
-
-/* This doesn't really need to be pagesize, but we use it for similar reasons. */
-#define PAGESIZE 65536
-
-/*
-  read while in a transaction. We need to check first if the data is in our list
-  of transaction elements, then if not do a real read
-*/
-static enum TDB_ERROR transaction_read(struct tdb_context *tdb, tdb_off_t off,
-				       void *buf, tdb_len_t len)
-{
-	size_t blk;
-	enum TDB_ERROR ecode;
-
-	/* break it down into block sized ops */
-	while (len + (off % PAGESIZE) > PAGESIZE) {
-		tdb_len_t len2 = PAGESIZE - (off % PAGESIZE);
-		ecode = transaction_read(tdb, off, buf, len2);
-		if (ecode != TDB_SUCCESS) {
-			return ecode;
-		}
-		len -= len2;
-		off += len2;
-		buf = (void *)(len2 + (char *)buf);
-	}
-
-	if (len == 0) {
-		return TDB_SUCCESS;
-	}
-
-	blk = off / PAGESIZE;
-
-	/* see if we have it in the block list */
-	if (tdb->transaction->num_blocks <= blk ||
-	    tdb->transaction->blocks[blk] == NULL) {
-		/* nope, do a real read */
-		ecode = tdb->transaction->io_methods->tread(tdb, off, buf, len);
-		if (ecode != TDB_SUCCESS) {
-			goto fail;
-		}
-		return 0;
-	}
-
-	/* it is in the block list. Now check for the last block */
-	if (blk == tdb->transaction->num_blocks-1) {
-		if (len > tdb->transaction->last_block_size) {
-			ecode = TDB_ERR_IO;
-			goto fail;
-		}
-	}
-
-	/* now copy it out of this block */
-	memcpy(buf, tdb->transaction->blocks[blk] + (off % PAGESIZE), len);
-	return TDB_SUCCESS;
-
-fail:
-	tdb->transaction->transaction_error = 1;
-	return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
-			  "transaction_read: failed at off=%zu len=%zu",
-			  (size_t)off, (size_t)len);
-}
-
-
-/*
-  write while in a transaction
-*/
-static enum TDB_ERROR transaction_write(struct tdb_context *tdb, tdb_off_t off,
-					const void *buf, tdb_len_t len)
-{
-	size_t blk;
-	enum TDB_ERROR ecode;
-
-	/* Only a commit is allowed on a prepared transaction */
-	if (tdb->transaction->prepared) {
-		ecode = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_ERROR,
-				   "transaction_write: transaction already"
-				   " prepared, write not allowed");
-		goto fail;
-	}
-
-	/* break it up into block sized chunks */
-	while (len + (off % PAGESIZE) > PAGESIZE) {
-		tdb_len_t len2 = PAGESIZE - (off % PAGESIZE);
-		ecode = transaction_write(tdb, off, buf, len2);
-		if (ecode != TDB_SUCCESS) {
-			return ecode;
-		}
-		len -= len2;
-		off += len2;
-		if (buf != NULL) {
-			buf = (const void *)(len2 + (const char *)buf);
-		}
-	}
-
-	if (len == 0) {
-		return TDB_SUCCESS;
-	}
-
-	blk = off / PAGESIZE;
-	off = off % PAGESIZE;
-
-	if (tdb->transaction->num_blocks <= blk) {
-		uint8_t **new_blocks;
-		/* expand the blocks array */
-		if (tdb->transaction->blocks == NULL) {
-			new_blocks = (uint8_t **)malloc(
-				(blk+1)*sizeof(uint8_t *));
-		} else {
-			new_blocks = (uint8_t **)realloc(
-				tdb->transaction->blocks,
-				(blk+1)*sizeof(uint8_t *));
-		}
-		if (new_blocks == NULL) {
-			ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
-					   "transaction_write:"
-					   " failed to allocate");
-			goto fail;
-		}
-		memset(&new_blocks[tdb->transaction->num_blocks], 0,
-		       (1+(blk - tdb->transaction->num_blocks))*sizeof(uint8_t *));
-		tdb->transaction->blocks = new_blocks;
-		tdb->transaction->num_blocks = blk+1;
-		tdb->transaction->last_block_size = 0;
-	}
-
-	/* allocate and fill a block? */
-	if (tdb->transaction->blocks[blk] == NULL) {
-		tdb->transaction->blocks[blk] = (uint8_t *)calloc(PAGESIZE, 1);
-		if (tdb->transaction->blocks[blk] == NULL) {
-			ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
-					   "transaction_write:"
-					   " failed to allocate");
-			goto fail;
-		}
-		if (tdb->transaction->old_map_size > blk * PAGESIZE) {
-			tdb_len_t len2 = PAGESIZE;
-			if (len2 + (blk * PAGESIZE) > tdb->transaction->old_map_size) {
-				len2 = tdb->transaction->old_map_size - (blk * PAGESIZE);
-			}
-			ecode = tdb->transaction->io_methods->tread(tdb,
-					blk * PAGESIZE,
-					tdb->transaction->blocks[blk],
-					len2);
-			if (ecode != TDB_SUCCESS) {
-				ecode = tdb_logerr(tdb, ecode,
-						   TDB_LOG_ERROR,
-						   "transaction_write:"
-						   " failed to"
-						   " read old block: %s",
-						   strerror(errno));
-				SAFE_FREE(tdb->transaction->blocks[blk]);
-				goto fail;
-			}
-			if (blk == tdb->transaction->num_blocks-1) {
-				tdb->transaction->last_block_size = len2;
-			}
-		}
-	}
-
-	/* overwrite part of an existing block */
-	if (buf == NULL) {
-		memset(tdb->transaction->blocks[blk] + off, 0, len);
-	} else {
-		memcpy(tdb->transaction->blocks[blk] + off, buf, len);
-	}
-	if (blk == tdb->transaction->num_blocks-1) {
-		if (len + off > tdb->transaction->last_block_size) {
-			tdb->transaction->last_block_size = len + off;
-		}
-	}
-
-	return TDB_SUCCESS;
-
-fail:
-	tdb->transaction->transaction_error = 1;
-	return ecode;
-}
-
-
-/*
-  write while in a transaction - this variant never expands the transaction blocks, it only
-  updates existing blocks. This means it cannot change the recovery size
-*/
-static void transaction_write_existing(struct tdb_context *tdb, tdb_off_t off,
-				       const void *buf, tdb_len_t len)
-{
-	size_t blk;
-
-	/* break it up into block sized chunks */
-	while (len + (off % PAGESIZE) > PAGESIZE) {
-		tdb_len_t len2 = PAGESIZE - (off % PAGESIZE);
-		transaction_write_existing(tdb, off, buf, len2);
-		len -= len2;
-		off += len2;
-		if (buf != NULL) {
-			buf = (const void *)(len2 + (const char *)buf);
-		}
-	}
-
-	if (len == 0) {
-		return;
-	}
-
-	blk = off / PAGESIZE;
-	off = off % PAGESIZE;
-
-	if (tdb->transaction->num_blocks <= blk ||
-	    tdb->transaction->blocks[blk] == NULL) {
-		return;
-	}
-
-	if (blk == tdb->transaction->num_blocks-1 &&
-	    off + len > tdb->transaction->last_block_size) {
-		if (off >= tdb->transaction->last_block_size) {
-			return;
-		}
-		len = tdb->transaction->last_block_size - off;
-	}
-
-	/* overwrite part of an existing block */
-	memcpy(tdb->transaction->blocks[blk] + off, buf, len);
-}
-
-
-/*
-  out of bounds check during a transaction
-*/
-static enum TDB_ERROR transaction_oob(struct tdb_context *tdb,
-				      tdb_off_t off, tdb_len_t len, bool probe)
-{
-	if ((off + len >= off && off + len <= tdb->file->map_size) || probe) {
-		return TDB_SUCCESS;
-	}
-
-	tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
-		   "tdb_oob len %lld beyond transaction size %lld",
-		   (long long)(off + len),
-		   (long long)tdb->file->map_size);
-	return TDB_ERR_IO;
-}
-
-/*
-  transaction version of tdb_expand().
-*/
-static enum TDB_ERROR transaction_expand_file(struct tdb_context *tdb,
-					      tdb_off_t addition)
-{
-	enum TDB_ERROR ecode;
-
-	/* add a write to the transaction elements, so subsequent
-	   reads see the zero data */
-	ecode = transaction_write(tdb, tdb->file->map_size, NULL, addition);
-	if (ecode == TDB_SUCCESS) {
-		tdb->file->map_size += addition;
-	}
-	return ecode;
-}
-
-static void *transaction_direct(struct tdb_context *tdb, tdb_off_t off,
-				size_t len, bool write_mode)
-{
-	size_t blk = off / PAGESIZE, end_blk;
-
-	/* This is wrong for zero-length blocks, but will fail gracefully */
-	end_blk = (off + len - 1) / PAGESIZE;
-
-	/* Can only do direct if in single block and we've already copied. */
-	if (write_mode) {
-		tdb->stats.transaction_write_direct++;
-		if (blk != end_blk
-		    || blk >= tdb->transaction->num_blocks
-		    || tdb->transaction->blocks[blk] == NULL) {
-			tdb->stats.transaction_write_direct_fail++;
-			return NULL;
-		}
-		return tdb->transaction->blocks[blk] + off % PAGESIZE;
-	}
-
-	tdb->stats.transaction_read_direct++;
-	/* Single which we have copied? */
-	if (blk == end_blk
-	    && blk < tdb->transaction->num_blocks
-	    && tdb->transaction->blocks[blk])
-		return tdb->transaction->blocks[blk] + off % PAGESIZE;
-
-	/* Otherwise must be all not copied. */
-	while (blk <= end_blk) {
-		if (blk >= tdb->transaction->num_blocks)
-			break;
-		if (tdb->transaction->blocks[blk]) {
-			tdb->stats.transaction_read_direct_fail++;
-			return NULL;
-		}
-		blk++;
-	}
-	return tdb->transaction->io_methods->direct(tdb, off, len, false);
-}
-
-static const struct tdb_methods transaction_methods = {
-	transaction_read,
-	transaction_write,
-	transaction_oob,
-	transaction_expand_file,
-	transaction_direct,
-};
-
-/*
-  sync to disk
-*/
-static enum TDB_ERROR transaction_sync(struct tdb_context *tdb,
-				       tdb_off_t offset, tdb_len_t length)
-{
-	if (tdb->flags & TDB_NOSYNC) {
-		return TDB_SUCCESS;
-	}
-
-	if (fsync(tdb->file->fd) != 0) {
-		return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
-				  "tdb_transaction: fsync failed: %s",
-				  strerror(errno));
-	}
-#ifdef MS_SYNC
-	if (tdb->file->map_ptr) {
-		tdb_off_t moffset = offset & ~(getpagesize()-1);
-		if (msync(moffset + (char *)tdb->file->map_ptr,
-			  length + (offset - moffset), MS_SYNC) != 0) {
-			return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
-					  "tdb_transaction: msync failed: %s",
-					  strerror(errno));
-		}
-	}
-#endif
-	return TDB_SUCCESS;
-}
-
-
-static void _tdb_transaction_cancel(struct tdb_context *tdb)
-{
-	int i;
-	enum TDB_ERROR ecode;
-
-	if (tdb->transaction == NULL) {
-		tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
-			   "tdb_transaction_cancel: no transaction");
-		return;
-	}
-
-	if (tdb->transaction->nesting != 0) {
-		tdb->transaction->transaction_error = 1;
-		tdb->transaction->nesting--;
-		return;
-	}
-
-	tdb->file->map_size = tdb->transaction->old_map_size;
-
-	/* free all the transaction blocks */
-	for (i=0;i<tdb->transaction->num_blocks;i++) {
-		if (tdb->transaction->blocks[i] != NULL) {
-			free(tdb->transaction->blocks[i]);
-		}
-	}
-	SAFE_FREE(tdb->transaction->blocks);
-
-	if (tdb->transaction->magic_offset) {
-		const struct tdb_methods *methods = tdb->transaction->io_methods;
-		uint64_t invalid = TDB_RECOVERY_INVALID_MAGIC;
-
-		/* remove the recovery marker */
-		ecode = methods->twrite(tdb, tdb->transaction->magic_offset,
-					&invalid, sizeof(invalid));
-		if (ecode == TDB_SUCCESS)
-			ecode = transaction_sync(tdb,
-						 tdb->transaction->magic_offset,
-						 sizeof(invalid));
-		if (ecode != TDB_SUCCESS) {
-			tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
-				   "tdb_transaction_cancel: failed to remove"
-				   " recovery magic");
-		}
-	}
-
-	if (tdb->file->allrecord_lock.count)
-		tdb_allrecord_unlock(tdb, tdb->file->allrecord_lock.ltype);
-
-	/* restore the normal io methods */
-	tdb->io = tdb->transaction->io_methods;
-
-	tdb_transaction_unlock(tdb, F_WRLCK);
-
-	if (tdb_has_open_lock(tdb))
-		tdb_unlock_open(tdb, F_WRLCK);
-
-	SAFE_FREE(tdb->transaction);
-}
-
-/*
-  start a tdb transaction. No token is returned, as only a single
-  transaction is allowed to be pending per tdb_context
-*/
-_PUBLIC_ enum TDB_ERROR tdb_transaction_start(struct tdb_context *tdb)
-{
-	enum TDB_ERROR ecode;
-
-	tdb->stats.transactions++;
-	/* some sanity checks */
-	if (tdb->flags & TDB_INTERNAL) {
-		return tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
-						    TDB_LOG_USE_ERROR,
-						    "tdb_transaction_start:"
-						    " cannot start a"
-						    " transaction on an"
-						    " internal tdb");
-	}
-
-	if (tdb->flags & TDB_RDONLY) {
-		return tdb->last_error = tdb_logerr(tdb, TDB_ERR_RDONLY,
-						    TDB_LOG_USE_ERROR,
-						    "tdb_transaction_start:"
-						    " cannot start a"
-						    " transaction on a "
-						    " read-only tdb");
-	}
-
-	/* cope with nested tdb_transaction_start() calls */
-	if (tdb->transaction != NULL) {
-		if (!(tdb->flags & TDB_ALLOW_NESTING)) {
-			return tdb->last_error
-				= tdb_logerr(tdb, TDB_ERR_IO,
-					     TDB_LOG_USE_ERROR,
-					     "tdb_transaction_start:"
-					     " already inside transaction");
-		}
-		tdb->transaction->nesting++;
-		tdb->stats.transaction_nest++;
-		return 0;
-	}
-
-	if (tdb_has_hash_locks(tdb)) {
-		/* the caller must not have any locks when starting a
-		   transaction as otherwise we'll be screwed by lack
-		   of nested locks in POSIX */
-		return tdb->last_error = tdb_logerr(tdb, TDB_ERR_LOCK,
-						    TDB_LOG_USE_ERROR,
-						    "tdb_transaction_start:"
-						    " cannot start a"
-						    " transaction with locks"
-						    " held");
-	}
-
-	tdb->transaction = (struct tdb_transaction *)
-		calloc(sizeof(struct tdb_transaction), 1);
-	if (tdb->transaction == NULL) {
-		return tdb->last_error = tdb_logerr(tdb, TDB_ERR_OOM,
-						    TDB_LOG_ERROR,
-						    "tdb_transaction_start:"
-						    " cannot allocate");
-	}
-
-	/* get the transaction write lock. This is a blocking lock. As
-	   discussed with Volker, there are a number of ways we could
-	   make this async, which we will probably do in the future */
-	ecode = tdb_transaction_lock(tdb, F_WRLCK);
-	if (ecode != TDB_SUCCESS) {
-		SAFE_FREE(tdb->transaction->blocks);
-		SAFE_FREE(tdb->transaction);
-		return tdb->last_error = ecode;
-	}
-
-	/* get a read lock over entire file. This is upgraded to a write
-	   lock during the commit */
-	ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, true);
-	if (ecode != TDB_SUCCESS) {
-		goto fail_allrecord_lock;
-	}
-
-	/* make sure we know about any file expansions already done by
-	   anyone else */
-	tdb->io->oob(tdb, tdb->file->map_size, 1, true);
-	tdb->transaction->old_map_size = tdb->file->map_size;
-
-	/* finally hook the io methods, replacing them with
-	   transaction specific methods */
-	tdb->transaction->io_methods = tdb->io;
-	tdb->io = &transaction_methods;
-	return tdb->last_error = TDB_SUCCESS;
-
-fail_allrecord_lock:
-	tdb_transaction_unlock(tdb, F_WRLCK);
-	SAFE_FREE(tdb->transaction->blocks);
-	SAFE_FREE(tdb->transaction);
-	return tdb->last_error = ecode;
-}
-
-
-/*
-  cancel the current transaction
-*/
-_PUBLIC_ void tdb_transaction_cancel(struct tdb_context *tdb)
-{
-	tdb->stats.transaction_cancel++;
-	_tdb_transaction_cancel(tdb);
-}
-
-/*
-  work out how much space the linearised recovery data will consume (worst case)
-*/
-static tdb_len_t tdb_recovery_size(struct tdb_context *tdb)
-{
-	tdb_len_t recovery_size = 0;
-	int i;
-
-	recovery_size = 0;
-	for (i=0;i<tdb->transaction->num_blocks;i++) {
-		if (i * PAGESIZE >= tdb->transaction->old_map_size) {
-			break;
-		}
-		if (tdb->transaction->blocks[i] == NULL) {
-			continue;
-		}
-		recovery_size += 2*sizeof(tdb_off_t);
-		if (i == tdb->transaction->num_blocks-1) {
-			recovery_size += tdb->transaction->last_block_size;
-		} else {
-			recovery_size += PAGESIZE;
-		}
-	}
-
-	return recovery_size;
-}
-
-static enum TDB_ERROR tdb_recovery_area(struct tdb_context *tdb,
-					const struct tdb_methods *methods,
-					tdb_off_t *recovery_offset,
-					struct tdb_recovery_record *rec)
-{
-	enum TDB_ERROR ecode;
-
-	*recovery_offset = tdb_read_off(tdb,
-					offsetof(struct tdb_header, recovery));
-	if (TDB_OFF_IS_ERR(*recovery_offset)) {
-		return TDB_OFF_TO_ERR(*recovery_offset);
-	}
-
-	if (*recovery_offset == 0) {
-		rec->max_len = 0;
-		return TDB_SUCCESS;
-	}
-
-	ecode = methods->tread(tdb, *recovery_offset, rec, sizeof(*rec));
-	if (ecode != TDB_SUCCESS)
-		return ecode;
-
-	tdb_convert(tdb, rec, sizeof(*rec));
-	/* ignore invalid recovery regions: can happen in crash */
-	if (rec->magic != TDB_RECOVERY_MAGIC &&
-	    rec->magic != TDB_RECOVERY_INVALID_MAGIC) {
-		*recovery_offset = 0;
-		rec->max_len = 0;
-	}
-	return TDB_SUCCESS;
-}
-
-static unsigned int same(const unsigned char *new,
-			 const unsigned char *old,
-			 unsigned int length)
-{
-	unsigned int i;
-
-	for (i = 0; i < length; i++) {
-		if (new[i] != old[i])
-			break;
-	}
-	return i;
-}
-
-static unsigned int different(const unsigned char *new,
-			      const unsigned char *old,
-			      unsigned int length,
-			      unsigned int min_same,
-			      unsigned int *samelen)
-{
-	unsigned int i;
-
-	*samelen = 0;
-	for (i = 0; i < length; i++) {
-		if (new[i] == old[i]) {
-			(*samelen)++;
-		} else {
-			if (*samelen >= min_same) {
-				return i - *samelen;
-			}
-			*samelen = 0;
-		}
-	}
-
-	if (*samelen < min_same)
-		*samelen = 0;
-	return length - *samelen;
-}
-
-/* Allocates recovery blob, without tdb_recovery_record at head set up. */
-static struct tdb_recovery_record *alloc_recovery(struct tdb_context *tdb,
-						  tdb_len_t *len)
-{
-	struct tdb_recovery_record *rec;
-	size_t i;
-	enum TDB_ERROR ecode;
-	unsigned char *p;
-	const struct tdb_methods *old_methods = tdb->io;
-
-	rec = malloc(sizeof(*rec) + tdb_recovery_size(tdb));
-	if (!rec) {
-		tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
-			   "transaction_setup_recovery:"
-			   " cannot allocate");
-		return TDB_ERR_PTR(TDB_ERR_OOM);
-	}
-
-	/* We temporarily revert to the old I/O methods, so we can use
-	 * tdb_access_read */
-	tdb->io = tdb->transaction->io_methods;
-
-	/* build the recovery data into a single blob to allow us to do a single
-	   large write, which should be more efficient */
-	p = (unsigned char *)(rec + 1);
-	for (i=0;i<tdb->transaction->num_blocks;i++) {
-		tdb_off_t offset;
-		tdb_len_t length;
-		unsigned int off;
-		const unsigned char *buffer;
-
-		if (tdb->transaction->blocks[i] == NULL) {
-			continue;
-		}
-
-		offset = i * PAGESIZE;
-		length = PAGESIZE;
-		if (i == tdb->transaction->num_blocks-1) {
-			length = tdb->transaction->last_block_size;
-		}
-
-		if (offset >= tdb->transaction->old_map_size) {
-			continue;
-		}
-
-		if (offset + length > tdb->file->map_size) {
-			ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-					   "tdb_transaction_setup_recovery:"
-					   " transaction data over new region"
-					   " boundary");
-			goto fail;
-		}
-		if (offset + length > tdb->transaction->old_map_size) {
-			/* Short read at EOF. */
-			length = tdb->transaction->old_map_size - offset;
-		}
-		buffer = tdb_access_read(tdb, offset, length, false);
-		if (TDB_PTR_IS_ERR(buffer)) {
-			ecode = TDB_PTR_ERR(buffer);
-			goto fail;
-		}
-
-		/* Skip over anything the same at the start. */
-		off = same(tdb->transaction->blocks[i], buffer, length);
-		offset += off;
-
-		while (off < length) {
-			tdb_len_t len1;
-			unsigned int samelen;
-
-			len1 = different(tdb->transaction->blocks[i] + off,
-					buffer + off, length - off,
-					sizeof(offset) + sizeof(len1) + 1,
-					&samelen);
-
-			memcpy(p, &offset, sizeof(offset));
-			memcpy(p + sizeof(offset), &len1, sizeof(len1));
-			tdb_convert(tdb, p, sizeof(offset) + sizeof(len1));
-			p += sizeof(offset) + sizeof(len1);
-			memcpy(p, buffer + off, len1);
-			p += len1;
-			off += len1 + samelen;
-			offset += len1 + samelen;
-		}
-		tdb_access_release(tdb, buffer);
-	}
-
-	*len = p - (unsigned char *)(rec + 1);
-	tdb->io = old_methods;
-	return rec;
-
-fail:
-	free(rec);
-	tdb->io = old_methods;
-	return TDB_ERR_PTR(ecode);
-}
-
-static tdb_off_t create_recovery_area(struct tdb_context *tdb,
-				      tdb_len_t rec_length,
-				      struct tdb_recovery_record *rec)
-{
-	tdb_off_t off, recovery_off;
-	tdb_len_t addition;
-	enum TDB_ERROR ecode;
-	const struct tdb_methods *methods = tdb->transaction->io_methods;
-
-	/* round up to a multiple of page size. Overallocate, since each
-	 * such allocation forces us to expand the file. */
-	rec->max_len = tdb_expand_adjust(tdb->file->map_size, rec_length);
-
-	/* Round up to a page. */
-	rec->max_len = ((sizeof(*rec) + rec->max_len + PAGESIZE-1)
-			& ~(PAGESIZE-1))
-		- sizeof(*rec);
-
-	off = tdb->file->map_size;
-
-	/* Restore ->map_size before calling underlying expand_file.
-	   Also so that we don't try to expand the file again in the
-	   transaction commit, which would destroy the recovery
-	   area */
-	addition = (tdb->file->map_size - tdb->transaction->old_map_size) +
-		sizeof(*rec) + rec->max_len;
-	tdb->file->map_size = tdb->transaction->old_map_size;
-	tdb->stats.transaction_expand_file++;
-	ecode = methods->expand_file(tdb, addition);
-	if (ecode != TDB_SUCCESS) {
-		tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
-			   "tdb_recovery_allocate:"
-			   " failed to create recovery area");
-		return TDB_ERR_TO_OFF(ecode);
-	}
-
-	/* we have to reset the old map size so that we don't try to
-	   expand the file again in the transaction commit, which
-	   would destroy the recovery area */
-	tdb->transaction->old_map_size = tdb->file->map_size;
-
-	/* write the recovery header offset and sync - we can sync without a race here
-	   as the magic ptr in the recovery record has not been set */
-	recovery_off = off;
-	tdb_convert(tdb, &recovery_off, sizeof(recovery_off));
-	ecode = methods->twrite(tdb, offsetof(struct tdb_header, recovery),
-				&recovery_off, sizeof(tdb_off_t));
-	if (ecode != TDB_SUCCESS) {
-		tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
-			   "tdb_recovery_allocate:"
-			   " failed to write recovery head");
-		return TDB_ERR_TO_OFF(ecode);
-	}
-	transaction_write_existing(tdb, offsetof(struct tdb_header, recovery),
-				   &recovery_off,
-				   sizeof(tdb_off_t));
-	return off;
-}
-
-/*
-  setup the recovery data that will be used on a crash during commit
-*/
-static enum TDB_ERROR transaction_setup_recovery(struct tdb_context *tdb)
-{
-	tdb_len_t recovery_size = 0;
-	tdb_off_t recovery_off = 0;
-	tdb_off_t old_map_size = tdb->transaction->old_map_size;
-	struct tdb_recovery_record *recovery;
-	const struct tdb_methods *methods = tdb->transaction->io_methods;
-	uint64_t magic;
-	enum TDB_ERROR ecode;
-
-	recovery = alloc_recovery(tdb, &recovery_size);
-	if (TDB_PTR_IS_ERR(recovery))
-		return TDB_PTR_ERR(recovery);
-
-	ecode = tdb_recovery_area(tdb, methods, &recovery_off, recovery);
-	if (ecode) {
-		free(recovery);
-		return ecode;
-	}
-
-	if (recovery->max_len < recovery_size) {
-		/* Not large enough. Free up old recovery area. */
-		if (recovery_off) {
-			tdb->stats.frees++;
-			ecode = add_free_record(tdb, recovery_off,
-						sizeof(*recovery)
-						+ recovery->max_len,
-						TDB_LOCK_WAIT, true);
-			free(recovery);
-			if (ecode != TDB_SUCCESS) {
-				return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
-						  "tdb_recovery_allocate:"
-						  " failed to free previous"
-						  " recovery area");
-			}
-
-			/* Refresh recovery after add_free_record above. */
-			recovery = alloc_recovery(tdb, &recovery_size);
-			if (TDB_PTR_IS_ERR(recovery))
-				return TDB_PTR_ERR(recovery);
-		}
-
-		recovery_off = create_recovery_area(tdb, recovery_size,
-						    recovery);
-		if (TDB_OFF_IS_ERR(recovery_off)) {
-			free(recovery);
-			return TDB_OFF_TO_ERR(recovery_off);
-		}
-	}
-
-	/* Now we know size, convert rec header. */
-	recovery->magic = TDB_RECOVERY_INVALID_MAGIC;
-	recovery->len = recovery_size;
-	recovery->eof = old_map_size;
-	tdb_convert(tdb, recovery, sizeof(*recovery));
-
-	/* write the recovery data to the recovery area */
-	ecode = methods->twrite(tdb, recovery_off, recovery, recovery_size);
-	if (ecode != TDB_SUCCESS) {
-		free(recovery);
-		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
-				  "tdb_transaction_setup_recovery:"
-				  " failed to write recovery data");
-	}
-	transaction_write_existing(tdb, recovery_off, recovery, recovery_size);
-
-	free(recovery);
-
-	/* as we don't have ordered writes, we have to sync the recovery
-	   data before we update the magic to indicate that the recovery
-	   data is present */
-	ecode = transaction_sync(tdb, recovery_off, recovery_size);
-	if (ecode != TDB_SUCCESS)
-		return ecode;
-
-	magic = TDB_RECOVERY_MAGIC;
-	tdb_convert(tdb, &magic, sizeof(magic));
-
-	tdb->transaction->magic_offset
-		= recovery_off + offsetof(struct tdb_recovery_record, magic);
-
-	ecode = methods->twrite(tdb, tdb->transaction->magic_offset,
-				&magic, sizeof(magic));
-	if (ecode != TDB_SUCCESS) {
-		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
-				  "tdb_transaction_setup_recovery:"
-				  " failed to write recovery magic");
-	}
-	transaction_write_existing(tdb, tdb->transaction->magic_offset,
-				   &magic, sizeof(magic));
-
-	/* ensure the recovery magic marker is on disk */
-	return transaction_sync(tdb, tdb->transaction->magic_offset,
-				sizeof(magic));
-}
-
-static enum TDB_ERROR _tdb_transaction_prepare_commit(struct tdb_context *tdb)
-{
-	const struct tdb_methods *methods;
-	enum TDB_ERROR ecode;
-
-	if (tdb->transaction == NULL) {
-		return tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
-				  "tdb_transaction_prepare_commit:"
-				  " no transaction");
-	}
-
-	if (tdb->transaction->prepared) {
-		_tdb_transaction_cancel(tdb);
-		return tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
-				  "tdb_transaction_prepare_commit:"
-				  " transaction already prepared");
-	}
-
-	if (tdb->transaction->transaction_error) {
-		_tdb_transaction_cancel(tdb);
-		return tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_ERROR,
-				  "tdb_transaction_prepare_commit:"
-				  " transaction error pending");
-	}
-
-
-	if (tdb->transaction->nesting != 0) {
-		return TDB_SUCCESS;
-	}
-
-	/* check for a null transaction */
-	if (tdb->transaction->blocks == NULL) {
-		return TDB_SUCCESS;
-	}
-
-	methods = tdb->transaction->io_methods;
-
-	/* upgrade the main transaction lock region to a write lock */
-	ecode = tdb_allrecord_upgrade(tdb, TDB_HASH_LOCK_START);
-	if (ecode != TDB_SUCCESS) {
-		return ecode;
-	}
-
-	/* get the open lock - this prevents new users attaching to the database
-	   during the commit */
-	ecode = tdb_lock_open(tdb, F_WRLCK, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK);
-	if (ecode != TDB_SUCCESS) {
-		return ecode;
-	}
-
-	/* Since we have whole db locked, we don't need the expansion lock. */
-	if (!(tdb->flags & TDB_NOSYNC)) {
-		/* Sets up tdb->transaction->recovery and
-		 * tdb->transaction->magic_offset. */
-		ecode = transaction_setup_recovery(tdb);
-		if (ecode != TDB_SUCCESS) {
-			return ecode;
-		}
-	}
-
-	tdb->transaction->prepared = true;
-
-	/* expand the file to the new size if needed */
-	if (tdb->file->map_size != tdb->transaction->old_map_size) {
-		tdb_len_t add;
-
-		add = tdb->file->map_size - tdb->transaction->old_map_size;
-		/* Restore original map size for tdb_expand_file */
-		tdb->file->map_size = tdb->transaction->old_map_size;
-		ecode = methods->expand_file(tdb, add);
-		if (ecode != TDB_SUCCESS) {
-			return ecode;
-		}
-	}
-
-	/* Keep the open lock until the actual commit */
-	return TDB_SUCCESS;
-}
-
-/*
-   prepare to commit the current transaction
-*/
-_PUBLIC_ enum TDB_ERROR tdb_transaction_prepare_commit(struct tdb_context *tdb)
-{
-	return tdb->last_error = _tdb_transaction_prepare_commit(tdb);
-}
-
-/*
-  commit the current transaction
-*/
-_PUBLIC_ enum TDB_ERROR tdb_transaction_commit(struct tdb_context *tdb)
-{
-	const struct tdb_methods *methods;
-	int i;
-	enum TDB_ERROR ecode;
-
-	if (tdb->transaction == NULL) {
-		return tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
-						    TDB_LOG_USE_ERROR,
-						    "tdb_transaction_commit:"
-						    " no transaction");
-	}
-
-	tdb_trace(tdb, "tdb_transaction_commit");
-
-	if (tdb->transaction->nesting != 0) {
-		tdb->transaction->nesting--;
-		return tdb->last_error = TDB_SUCCESS;
-	}
-
-	/* check for a null transaction */
-	if (tdb->transaction->blocks == NULL) {
-		_tdb_transaction_cancel(tdb);
-		return tdb->last_error = TDB_SUCCESS;
-	}
-
-	if (!tdb->transaction->prepared) {
-		ecode = _tdb_transaction_prepare_commit(tdb);
-		if (ecode != TDB_SUCCESS) {
-			_tdb_transaction_cancel(tdb);
-			return tdb->last_error = ecode;
-		}
-	}
-
-	methods = tdb->transaction->io_methods;
-
-	/* perform all the writes */
-	for (i=0;i<tdb->transaction->num_blocks;i++) {
-		tdb_off_t offset;
-		tdb_len_t length;
-
-		if (tdb->transaction->blocks[i] == NULL) {
-			continue;
-		}
-
-		offset = i * PAGESIZE;
-		length = PAGESIZE;
-		if (i == tdb->transaction->num_blocks-1) {
-			length = tdb->transaction->last_block_size;
-		}
-
-		ecode = methods->twrite(tdb, offset,
-					tdb->transaction->blocks[i], length);
-		if (ecode != TDB_SUCCESS) {
-			/* we've overwritten part of the data and
-			   possibly expanded the file, so we need to
-			   run the crash recovery code */
-			tdb->io = methods;
-			tdb_transaction_recover(tdb);
-
-			_tdb_transaction_cancel(tdb);
-
-			return tdb->last_error = ecode;
-		}
-		SAFE_FREE(tdb->transaction->blocks[i]);
-	}
-
-	SAFE_FREE(tdb->transaction->blocks);
-	tdb->transaction->num_blocks = 0;
-
-	/* ensure the new data is on disk */
-	ecode = transaction_sync(tdb, 0, tdb->file->map_size);
-	if (ecode != TDB_SUCCESS) {
-		return tdb->last_error = ecode;
-	}
-
-	/*
-	  TODO: maybe write to some dummy hdr field, or write to magic
-	  offset without mmap, before the last sync, instead of the
-	  utime() call
-	*/
-
-	/* on some systems (like Linux 2.6.x) changes via mmap/msync
-	   don't change the mtime of the file, this means the file may
-	   not be backed up (as tdb rounding to block sizes means that
-	   file size changes are quite rare too). The following forces
-	   mtime changes when a transaction completes */
-#if HAVE_UTIME
-	utime(tdb->name, NULL);
-#endif
-
-	/* use a transaction cancel to free memory and remove the
-	   transaction locks: it "restores" map_size, too. */
-	tdb->transaction->old_map_size = tdb->file->map_size;
-	_tdb_transaction_cancel(tdb);
-
-	return tdb->last_error = TDB_SUCCESS;
-}
-
-
-/*
-  recover from an aborted transaction. Must be called with exclusive
-  database write access already established (including the open
-  lock to prevent new processes attaching)
-*/
-enum TDB_ERROR tdb_transaction_recover(struct tdb_context *tdb)
-{
-	tdb_off_t recovery_head, recovery_eof;
-	unsigned char *data, *p;
-	struct tdb_recovery_record rec;
-	enum TDB_ERROR ecode;
-
-	/* find the recovery area */
-	recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery));
-	if (TDB_OFF_IS_ERR(recovery_head)) {
-		ecode = TDB_OFF_TO_ERR(recovery_head);
-		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
-				  "tdb_transaction_recover:"
-				  " failed to read recovery head");
-	}
-
-	if (recovery_head == 0) {
-		/* we have never allocated a recovery record */
-		return TDB_SUCCESS;
-	}
-
-	/* read the recovery record */
-	ecode = tdb_read_convert(tdb, recovery_head, &rec, sizeof(rec));
-	if (ecode != TDB_SUCCESS) {
-		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
-				  "tdb_transaction_recover:"
-				  " failed to read recovery record");
-	}
-
-	if (rec.magic != TDB_RECOVERY_MAGIC) {
-		/* there is no valid recovery data */
-		return TDB_SUCCESS;
-	}
-
-	if (tdb->flags & TDB_RDONLY) {
-		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
-				  "tdb_transaction_recover:"
-				  " attempt to recover read only database");
-	}
-
-	recovery_eof = rec.eof;
-
-	data = (unsigned char *)malloc(rec.len);
-	if (data == NULL) {
-		return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
-				  "tdb_transaction_recover:"
-				  " failed to allocate recovery data");
-	}
-
-	/* read the full recovery data */
-	ecode = tdb->io->tread(tdb, recovery_head + sizeof(rec), data,
-				    rec.len);
-	if (ecode != TDB_SUCCESS) {
-		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
-				  "tdb_transaction_recover:"
-				  " failed to read recovery data");
-	}
-
-	/* recover the file data */
-	p = data;
-	while (p+sizeof(tdb_off_t)+sizeof(tdb_len_t) < data + rec.len) {
-		tdb_off_t ofs;
-		tdb_len_t len;
-		tdb_convert(tdb, p, sizeof(ofs) + sizeof(len));
-		memcpy(&ofs, p, sizeof(ofs));
-		memcpy(&len, p + sizeof(ofs), sizeof(len));
-		p += sizeof(ofs) + sizeof(len);
-
-		ecode = tdb->io->twrite(tdb, ofs, p, len);
-		if (ecode != TDB_SUCCESS) {
-			free(data);
-			return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
-					  "tdb_transaction_recover:"
-					  " failed to recover %zu bytes"
-					  " at offset %zu",
-					  (size_t)len, (size_t)ofs);
-		}
-		p += len;
-	}
-
-	free(data);
-
-	ecode = transaction_sync(tdb, 0, tdb->file->map_size);
-	if (ecode != TDB_SUCCESS) {
-		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
-				  "tdb_transaction_recover:"
-				  " failed to sync recovery");
-	}
-
-	/* if the recovery area is after the recovered eof then remove it */
-	if (recovery_eof <= recovery_head) {
-		ecode = tdb_write_off(tdb, offsetof(struct tdb_header,
-						    recovery),
-				      0);
-		if (ecode != TDB_SUCCESS) {
-			return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
-					  "tdb_transaction_recover:"
-					  " failed to remove recovery head");
-		}
-	}
-
-	/* remove the recovery magic */
-	ecode = tdb_write_off(tdb,
-			      recovery_head
-			      + offsetof(struct tdb_recovery_record, magic),
-			      TDB_RECOVERY_INVALID_MAGIC);
-	if (ecode != TDB_SUCCESS) {
-		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
-				  "tdb_transaction_recover:"
-				  " failed to remove recovery magic");
-	}
-
-	ecode = transaction_sync(tdb, 0, recovery_eof);
-	if (ecode != TDB_SUCCESS) {
-		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
-				  "tdb_transaction_recover:"
-				  " failed to sync2 recovery");
-	}
-
-	tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
-		   "tdb_transaction_recover: recovered %zu byte database",
-		   (size_t)recovery_eof);
-
-	/* all done */
-	return TDB_SUCCESS;
-}
-
-tdb_bool_err tdb_needs_recovery(struct tdb_context *tdb)
-{
-	tdb_off_t recovery_head;
-	struct tdb_recovery_record rec;
-	enum TDB_ERROR ecode;
-
-	/* find the recovery area */
-	recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery));
-	if (TDB_OFF_IS_ERR(recovery_head)) {
-		return recovery_head;
-	}
-
-	if (recovery_head == 0) {
-		/* we have never allocated a recovery record */
-		return false;
-	}
-
-	/* read the recovery record */
-	ecode = tdb_read_convert(tdb, recovery_head, &rec, sizeof(rec));
-	if (ecode != TDB_SUCCESS) {
-		return TDB_ERR_TO_OFF(ecode);
-	}
-
-	return (rec.magic == TDB_RECOVERY_MAGIC);
-}
diff --git a/lib/tdb2/traverse.c b/lib/tdb2/traverse.c
deleted file mode 100644
index ed51a9ee72..0000000000
--- a/lib/tdb2/traverse.c
+++ /dev/null
@@ -1,99 +0,0 @@
- /*
-   Trivial Database 2: traverse function.
-   Copyright (C) Rusty Russell 2010
-
-   This library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 3 of the License, or (at your option) any later version.
-
-   This library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
-#include "private.h"
-#include <ccan/likely/likely.h>
-
-_PUBLIC_ int64_t tdb_traverse_(struct tdb_context *tdb,
-		      int (*fn)(struct tdb_context *,
-				TDB_DATA, TDB_DATA, void *),
-		      void *p)
-{
-	enum TDB_ERROR ecode;
-	struct traverse_info tinfo;
-	struct tdb_data k, d;
-	int64_t count = 0;
-
-	k.dptr = NULL;
-	for (ecode = first_in_hash(tdb, &tinfo, &k, &d.dsize);
-	     ecode == TDB_SUCCESS;
-	     ecode = next_in_hash(tdb, &tinfo, &k, &d.dsize)) {
-		d.dptr = k.dptr + k.dsize;
-
-		count++;
-		if (fn && fn(tdb, k, d, p)) {
-			free(k.dptr);
-			tdb->last_error = TDB_SUCCESS;
-			return count;
-		}
-		free(k.dptr);
-	}
-
-	if (ecode != TDB_ERR_NOEXIST) {
-		return TDB_ERR_TO_OFF(tdb->last_error = ecode);
-	}
-	tdb->last_error = TDB_SUCCESS;
-	return count;
-}
-
-_PUBLIC_ enum TDB_ERROR tdb_firstkey(struct tdb_context *tdb, struct tdb_data *key)
-{
-	struct traverse_info tinfo;
-
-	return tdb->last_error = first_in_hash(tdb, &tinfo, key, NULL);
-}
-
-/* We lock twice, not very efficient.  We could keep last key & tinfo cached. */
-_PUBLIC_ enum TDB_ERROR tdb_nextkey(struct tdb_context *tdb, struct tdb_data *key)
-{
-	struct traverse_info tinfo;
-	struct hash_info h;
-	struct tdb_used_record rec;
-
-	tinfo.prev = find_and_lock(tdb, *key, F_RDLCK, &h, &rec, &tinfo);
-	free(key->dptr);
-	if (TDB_OFF_IS_ERR(tinfo.prev)) {
-		return tdb->last_error = TDB_OFF_TO_ERR(tinfo.prev);
-	}
-	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK);
-
-	return tdb->last_error = next_in_hash(tdb, &tinfo, key, NULL);
-}
-
-static int wipe_one(struct tdb_context *tdb,
-		    TDB_DATA key, TDB_DATA data, enum TDB_ERROR *ecode)
-{
-	*ecode = tdb_delete(tdb, key);
-	return (*ecode != TDB_SUCCESS);
-}
-
-_PUBLIC_ enum TDB_ERROR tdb_wipe_all(struct tdb_context *tdb)
-{
-	enum TDB_ERROR ecode;
-	int64_t count;
-
-	ecode = tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false);
-	if (ecode != TDB_SUCCESS)
-		return tdb->last_error = ecode;
-
-	/* FIXME: Be smarter. */
-	count = tdb_traverse(tdb, wipe_one, &ecode);
-	if (count < 0)
-		ecode = TDB_OFF_TO_ERR(count);
-	tdb_allrecord_unlock(tdb, F_WRLCK);
-	return tdb->last_error = ecode;
-}
diff --git a/lib/tdb2/wscript b/lib/tdb2/wscript
deleted file mode 100644
index ef30f1b778..0000000000
--- a/lib/tdb2/wscript
+++ /dev/null
@@ -1,278 +0,0 @@
-#!/usr/bin/env python
-
-APPNAME = 'tdb'
-VERSION = '2.0.1'
-
-blddir = 'bin'
-
-import sys, os
-
-# find the buildtools directory
-srcdir = '.'
-while not os.path.exists(srcdir+'/buildtools') and len(srcdir.split('/')) < 5:
-    srcdir = '../' + srcdir
-sys.path.insert(0, srcdir + '/buildtools/wafsamba')
-
-import wafsamba, samba_dist, Options, Logs, glob
-
-samba_dist.DIST_DIRS('lib/tdb2:. lib/replace:lib/replace lib/ccan:lib/ccan buildtools:buildtools')
-
-def set_options(opt):
-    opt.BUILTIN_DEFAULT('replace,ccan')
-    opt.PRIVATE_EXTENSION_DEFAULT('tdb2', noextension='tdb2')
-    opt.RECURSE('lib/replace')
-    opt.add_option('--enable-developer',
-                   help=("Turn on developer warnings and debugging"),
-                   action="store_true", dest='developer', default=False)
-    opt.add_option('--enable-tdb2',
-                   help=("Use tdb2 API instead of tdb1 [True]"),
-                   action="store_true", dest='BUILD_TDB2', default=True)
-    opt.add_option('--disable-tdb2',
-                   help=("Use old tdb1 API instead of tdb2"),
-                   action="store_false", dest='BUILD_TDB2')
-    opt.add_option('--valgrind',
-                   help=("use valgrind on tests programs"),
-                   action="store_true", dest='VALGRIND', default=False)
-    opt.add_option('--valgrind-log',
-                   help=("where to put the valgrind log"),
-                   action="store", dest='VALGRINDLOG', default=None)
-    if opt.IN_LAUNCH_DIR():
-        opt.add_option('--disable-python',
-                       help=("disable the pytdb module"),
-                       action="store_true", dest='disable_python', default=False)
-
-def configure(conf):
-    if Options.options.developer:
-        conf.env.DEVELOPER_MODE = True
-
-    conf.env.TEST_RUN_SRC=['test/run-001-encode.c',
-                           'test/run-001-fls.c',
-                           'test/run-01-new_database.c',
-                           'test/run-02-expand.c',
-                           'test/run-03-coalesce.c',
-                           'test/run-04-basichash.c',
-                           'test/run-05-readonly-open.c',
-                           'test/run-10-simple-store.c',
-                           'test/run-11-simple-fetch.c',
-                           'test/run-12-check.c',
-                           'test/run-15-append.c',
-                           'test/run-20-growhash.c',
-                           'test/run-25-hashoverload.c',
-                           'test/run-30-exhaust-before-expand.c',
-                           'test/run-35-convert.c',
-                           'test/run-50-multiple-freelists.c',
-                           'test/run-56-open-during-transaction.c',
-                           'test/run-57-die-during-transaction.c',
-                           'test/run-64-bit-tdb.c',
-                           'test/run-90-get-set-attributes.c',
-                           'test/run-capabilities.c',
-                           'test/run-expand-in-transaction.c',
-                           'test/run-features.c',
-                           'test/run-lockall.c',
-                           'test/run-remap-in-read_traverse.c',
-                           'test/run-seed.c',
-                           'test/run-tdb_errorstr.c',
-                           'test/run-tdb_foreach.c',
-                           'test/run-traverse.c']
-    conf.env.TEST_API_SRC=['test/api-12-store.c',
-                           'test/api-13-delete.c',
-                           'test/api-14-exists.c',
-                           'test/api-16-wipe_all.c',
-                           'test/api-21-parse_record.c',
-                           'test/api-55-transaction.c',
-                           'test/api-80-tdb_fd.c',
-                           'test/api-81-seqnum.c',
-                           'test/api-82-lockattr.c',
-                           'test/api-83-openhook.c',
-                           'test/api-91-get-stats.c',
-                           'test/api-92-get-set-readonly.c',
-                           'test/api-93-repack.c',
-                           'test/api-add-remove-flags.c',
-                           'test/api-check-callback.c',
-                           'test/api-firstkey-nextkey.c',
-                           'test/api-fork-test.c',
-                           'test/api-locktimeout.c',
-                           'test/api-missing-entries.c',
-                           'test/api-open-multiple-times.c',
-                           'test/api-record-expand.c',
-                           'test/api-simple-delete.c',
-                           'test/api-summary.c']
-    conf.env.TEST_API_HELPER_SRC=['test/helpapi-external-agent.c']
-    conf.env.TEST_RUN_HELPER_SRC=['test/helprun-external-agent.c',
-                                  'test/helprun-layout.c']
-    conf.env.TEST_HELPER_SRC=['test/external-agent.c',
-                              'test/failtest_helper.c',
-                              'test/lock-tracking.c',
-                              'test/logging.c',
-                              'test/tap-interface.c']
-
-    if Options.options.BUILD_TDB2:
-        conf.DEFINE('BUILD_TDB2', 1)
-        conf.RECURSE('lib/replace')
-        conf.RECURSE('lib/ccan')
-
-        conf.env.standalone_tdb2 = conf.IN_LAUNCH_DIR()
-        conf.env.disable_python = getattr(Options.options, 'disable_python', False)
-
-        if not conf.env.standalone_tdb2:
-            if conf.CHECK_BUNDLED_SYSTEM('tdb', minversion=VERSION,
-                                         implied_deps='replace'):
-                conf.define('USING_SYSTEM_TDB2', 1)
-                if conf.CHECK_BUNDLED_SYSTEM_PYTHON('pytdb', 'tdb', minversion=VERSION):
-                    conf.define('USING_SYSTEM_PYTDB', 1)
-
-        if not conf.env.disable_python:
-            # also disable if we don't have the python libs installed
-            conf.find_program('python', var='PYTHON')
-            conf.check_tool('python')
-            conf.check_python_version((2,4,2))
-            conf.SAMBA_CHECK_PYTHON_HEADERS(mandatory=False)
-            if not conf.env.HAVE_PYTHON_H:
-                Logs.warn('Disabling pytdb as python devel libs not found')
-                conf.env.disable_python = True
-
-        # This make #include <ccan/...> work.
-        conf.ADD_EXTRA_INCLUDES('''#lib''')
-
-        conf.SAMBA_CONFIG_H()
-
-def build(bld):
-    if bld.env.BUILD_TDB2:
-        bld.RECURSE('lib/replace')
-        bld.RECURSE('lib/ccan')
-
-        if bld.env.standalone_tdb2:
-            bld.env.PKGCONFIGDIR = '${LIBDIR}/pkgconfig'
-            private_library = False
-        else:
-            private_library = True
-
-        SRC = '''check.c free.c hash.c io.c lock.c open.c
-                 summary.c tdb.c transaction.c traverse.c'''
-
-        if not bld.CONFIG_SET('USING_SYSTEM_TDB2'):
-            bld.SAMBA_LIBRARY('tdb',
-                              SRC,
-                              deps='replace ccan',
-                              includes='.',
-                              abi_directory='ABI',
-                              abi_match='tdb_*',
-                              hide_symbols=True,
-                              vnum=VERSION,
-                              public_headers='tdb2.h',
-                              public_headers_install=not private_library,
-                              pc_files='tdb.pc',
-                              private_library=private_library)
-
-            bld.SAMBA_BINARY('tdbtorture',
-                             'tools/tdb2torture.c',
-                             deps='tdb',
-                             install=False)
-
-            bld.SAMBA_BINARY('tdbtool',
-                             'tools/tdb2tool.c',
-                             deps='tdb')
-
-            bld.SAMBA_BINARY('tdbdump',
-                             'tools/tdb2dump.c',
-                             deps='tdb')
-
-            bld.SAMBA_BINARY('tdbrestore',
-                             'tools/tdb2restore.c',
-                             deps='tdb')
-
-            bld.SAMBA_BINARY('tdbbackup',
-                             'tools/tdb2backup.c',
-                             deps='tdb')
-
-            if not bld.CONFIG_SET('USING_SYSTEM_PYTDB'):
-                bld.SAMBA_PYTHON('pytdb',
-                                 source='pytdb.c',
-                                 deps='tdb',
-                                 enabled=not bld.env.disable_python,
-                                 realname='tdb.so',
-                                 cflags='-DPACKAGE_VERSION=\"%s\"' % VERSION)
-
-            if bld.env.DEVELOPER_MODE:
-                # FIXME: We need CCAN for some API tests, but waf thinks it's
-                # already available via tdb2.  It is, but not publicly.
-                # Workaround is to build a private, non-hiding version.
-                bld.SAMBA_SUBSYSTEM('tdb2-testing',
-                                    SRC,
-                                    deps='replace ccan',
-                                    includes='.')
-
-                bld.SAMBA_SUBSYSTEM('tdb2-test-helpers', bld.env.TEST_HELPER_SRC,
-                                    deps='replace')
-                bld.SAMBA_SUBSYSTEM('tdb2-run-helpers', bld.env.TEST_RUN_HELPER_SRC,
-                                    deps='replace')
-                bld.SAMBA_SUBSYSTEM('tdb2-api-helpers', bld.env.TEST_API_HELPER_SRC,
-                                    deps='replace tdb2-testing')
-
-                for f in bld.env.TEST_RUN_SRC:
-                    base = os.path.splitext(os.path.basename(f))[0]
-                    bld.SAMBA_BINARY('tdb2-' + base, f,
-                                     deps='ccan replace tdb2-test-helpers tdb2-run-helpers ccan-failtest',
-                                     install=False)
-
-                for f in bld.env.TEST_API_SRC:
-                    base = os.path.splitext(os.path.basename(f))[0]
-                    bld.SAMBA_BINARY('tdb2-' + base, f,
-                                     deps='ccan replace tdb2-test-helpers tdb2-api-helpers',
-                                     install=False)
-
-def testonly(ctx):
-    '''run tdb2 testsuite'''
-    import Utils, samba_utils, shutil
-    ecode = 0;
-
-    env = samba_utils.LOAD_ENVIRONMENT()
-
-    if env.BUILD_TDB2 and env.standalone_tdb2 and env.DEVELOPER_MODE:
-
-        # FIXME: This is horrible :(
-        test_prefix = "%s/st" % (Utils.g_module.blddir)
-        shutil.rmtree(test_prefix, ignore_errors=True)
-        os.makedirs(test_prefix)
-
-        # Create scratch directory for tests.
-        testdir = os.path.join(test_prefix, 'tdb2-tests')
-        samba_utils.mkdir_p(testdir)
-        # Symlink back to source dir so it can find tests in test/
-        link = os.path.join(testdir, 'test')
-        if not os.path.exists(link):
-            os.symlink(os.path.abspath(os.path.join(env.cwd, 'test')), link)
-
-        if Options.options.VALGRIND:
-            os.environ['VALGRIND'] = 'valgrind -q --num-callers=30'
-        if Options.options.VALGRINDLOG is not None:
-            os.environ['VALGRIND'] += ' --log-file=%s' % Options.options.VALGRINDLOG
-
-        for f in env.TEST_RUN_SRC + env.TEST_API_SRC:
-            name = "tdb2-" + os.path.splitext(os.path.basename(f))[0]
-            cmd = "cd " + testdir + " && $VALGRIND " + os.path.abspath(os.path.join(Utils.g_module.blddir, name)) + " > test-output 2>&1"
-            print("..." + f)
-            ret = samba_utils.RUN_COMMAND(cmd)
-            if ret != 0:
-                print("%s (%s) failed:" % (name, f))
-                samba_utils.RUN_COMMAND("cat " + os.path.join(testdir, 'test-output'))
-                ecode = ret;
-                break;
-
-    sys.exit(ecode)
-
-# WAF doesn't build the unit tests for this, maybe because they don't link with tdb?
-# This forces it
-def test(ctx):
-    import Scripting
-    Scripting.commands.append('build')
-    Scripting.commands.append('testonly')
-
-def dist():
-    '''makes a tarball for distribution'''
-    samba_dist.dist()
-
-def reconfigure(ctx):
-    '''reconfigure if config scripts have changed'''
-    import samba_utils
-    samba_utils.reconfigure(ctx)
diff --git a/script/autobuild.py b/script/autobuild.py
index 9fb0a7ced5..fcdfdb7dd3 100755
--- a/script/autobuild.py
+++ b/script/autobuild.py
@@ -21,6 +21,7 @@ builddirs = {
     "samba4-libs"  : ".",
     "ldb"     : "lib/ldb",
     "tdb"     : "lib/tdb",
+    "ntdb"    : "lib/ntdb",
     "talloc"  : "lib/talloc",
     "replace" : "lib/replace",
     "tevent"  : "lib/tevent",
@@ -30,7 +31,7 @@ builddirs = {
     "retry"   : "."
     }
 
-defaulttasks = [ "samba3", "samba4", "samba4-libs", "ldb", "tdb", "talloc", "replace", "tevent", "pidl" ]
+defaulttasks = [ "samba3", "samba4", "samba4-libs", "ldb", "tdb", "ntdb", "talloc", "replace", "tevent", "pidl" ]
 
 tasks = {
     "samba3" : [ ("autogen", "./autogen.sh", "text/plain"),
@@ -91,6 +92,14 @@ tasks = {
               ("distcheck", "make distcheck", "text/plain"),
               ("clean", "make clean", "text/plain") ],
 
+    "ntdb" : [ ("configure", "./configure --enable-developer -C ${PREFIX}", "text/plain"),
+               ("make", "make -j", "text/plain"),
+               ("install", "make install", "text/plain"),
+               ("test", "make test", "text/plain"),
+               ("check-clean-tree", "../../script/clean-source-tree.sh", "text/plain"),
+               ("distcheck", "make distcheck", "text/plain"),
+               ("clean", "make clean", "text/plain") ],
+
     "talloc" : [ ("configure", "./configure --enable-developer -C ${PREFIX}", "text/plain"),
                  ("make", "make -j", "text/plain"),
                  ("install", "make install", "text/plain"),
-- 
cgit