diff options
author | Rusty Russell <rusty@rustcorp.com.au> | 2012-06-19 12:43:04 +0930 |
---|---|---|
committer | Rusty Russell <rusty@rustcorp.com.au> | 2012-06-19 05:38:07 +0200 |
commit | dd42962878ab7c9ddfa79d7c32094fb6748017b8 (patch) | |
tree | a614af427c5ad0d962db77a58f133cb39c9bd057 /lib/ntdb/check.c | |
parent | f986554b1e38d8dd40b4bf4748d4aeb470e27d2e (diff) | |
download | samba-dd42962878ab7c9ddfa79d7c32094fb6748017b8.tar.gz samba-dd42962878ab7c9ddfa79d7c32094fb6748017b8.tar.bz2 samba-dd42962878ab7c9ddfa79d7c32094fb6748017b8.zip |
ntdb: remove hash table trees.
TDB2 started with a top-level hash of 1024 entries, divided into 128
groups of 8 buckets. When a bucket filled, the 8-bucket group
expanded into pointers to 8 new 64-entry hash tables. When these
filled, they expanded in turn, etc.
It's a nice idea to automatically expand the hash tables, but it
doesn't pay off. Remove it for NTDB.
1) It only beats TDB performance when the database is huge and the
TDB hashsize is small. We are about 20% slower on medium-size
databases (1000 to 10000 records), worse on really small ones.
2) Since we're 64 bits, our hash tables are already twice as expensive
as TDB's.
3) Since our hash function is good, it means that all groups tend to
fill at the same time, meaning the hash enlarges by a factor of 128
all at once, leading to a very large database at that point.
4) Our efficiency would improve if we enlarged the top level, but
that makes our minimum db size even worse: it's already over 8k,
and jumps to 1M after about 1000 entries!
5) Making the subgroup size larger gives a shallower tree, which
performs better, but makes the "hash explosion" problem worse.
6) The code is complicated, having to handle delete and reshuffling
groups of hash buckets, and expansion of buckets.
7) We have to handle the case where all the records somehow end up with
the same hash value, which requires special code to chain records for
that case.
On the other hand, it would be nice if we didn't degrade as badly as
TDB does when the hash chains get long.
This patch removes the hash-growing code, but instead of chaining like
TDB does when a bucket fills, we point the bucket to an array of
record pointers. Since each on-disk NTDB pointer contains some hash
bits from the record (we steal the upper 8 bits of the offset), 99.5%
of the time we don't need to load the record to determine if it
matches. This makes an array of offsets much more cache-friendly than
a linked list.
Here are the times (in ns) for a first tdb_store of N records ("Insert"),
a second tdb_store of N records ("Re-ins"), and a fetch of all N records
("Fetch"). I've also included the final database size and the smbtorture
local.[n]tdb_speed results.
Benchmark details:
1) Compiled with -O2.
2) assert() was disabled in TDB2 and NTDB.
3) The "optimize fetch" patch was applied to NTDB.
10 runs, using tmpfs (otherwise massive swapping as db hits ~30M,
despite plenty of RAM).
Insert Re-ins Fetch Size dbspeed
(nsec) (nsec) (nsec) (Kb) (ops/sec)
TDB (10000 hashsize):
100 records: 3882 3320 1609 53 203204
1000 records: 3651 3281 1571 115 218021
10000 records: 3404 3326 1595 880 202874
100000 records: 4317 3825 2097 8262 126811
1000000 records: 11568 11578 9320 77005 25046
TDB2 (1024 hashsize, expandable):
100 records: 3867 3329 1699 17 187100
1000 records: 4040 3249 1639 154 186255
10000 records: 4143 3300 1695 1226 185110
100000 records: 4481 3425 1800 17848 163483
1000000 records: 4055 3534 1878 106386 160774
NTDB (8192 hashsize)
100 records: 4259 3376 1692 82 190852
1000 records: 3640 3275 1566 130 195106
10000 records: 4337 3438 1614 773 188362
100000 records: 4750 5165 1746 9001 169197
1000000 records: 4897 5180 2341 83838 121901
Analysis:
1) TDB wins on small databases, beating TDB2 by ~15%, NTDB by ~10%.
2) TDB starts to lose when hash chains get 10 long (fetch 10% slower
than TDB2/NTDB).
3) TDB does horribly when hash chains get 100 long (fetch 4x slower
than NTDB, 5x slower than TDB2, insert about 2-3x slower).
4) TDB2 databases are 40% larger than TDB1. NTDB is about 15% larger
than TDB1.
Diffstat (limited to 'lib/ntdb/check.c')
-rw-r--r-- | lib/ntdb/check.c | 484 |
1 files changed, 169 insertions, 315 deletions
diff --git a/lib/ntdb/check.c b/lib/ntdb/check.c index be27003a51..2790c68eaf 100644 --- a/lib/ntdb/check.c +++ b/lib/ntdb/check.c @@ -38,8 +38,10 @@ static bool append(struct ntdb_context *ntdb, return true; } -static enum NTDB_ERROR check_header(struct ntdb_context *ntdb, ntdb_off_t *recovery, - uint64_t *features, size_t *num_capabilities) +static enum NTDB_ERROR check_header(struct ntdb_context *ntdb, + ntdb_off_t *recovery, + uint64_t *features, + size_t *num_capabilities) { uint64_t hash_test; struct ntdb_header hdr; @@ -112,374 +114,227 @@ static enum NTDB_ERROR check_header(struct ntdb_context *ntdb, ntdb_off_t *recov return NTDB_SUCCESS; } -static enum NTDB_ERROR check_hash_tree(struct ntdb_context *ntdb, - ntdb_off_t off, unsigned int group_bits, - uint64_t hprefix, - unsigned hprefix_bits, - ntdb_off_t used[], - size_t num_used, - size_t *num_found, - enum NTDB_ERROR (*check)(NTDB_DATA, - NTDB_DATA, void *), - void *data); +static int off_cmp(const ntdb_off_t *a, const ntdb_off_t *b) +{ + /* Can overflow an int. */ + return *a > *b ? 1 + : *a < *b ? -1 + : 0; +} -static enum NTDB_ERROR check_hash_chain(struct ntdb_context *ntdb, - ntdb_off_t off, - uint64_t hash, - ntdb_off_t used[], - size_t num_used, - size_t *num_found, - enum NTDB_ERROR (*check)(NTDB_DATA, - NTDB_DATA, - void *), - void *data) +static enum NTDB_ERROR check_entry(struct ntdb_context *ntdb, + ntdb_off_t off_and_hash, + ntdb_len_t bucket, + ntdb_off_t used[], + size_t num_used, + size_t *num_found, + enum NTDB_ERROR (*check)(NTDB_DATA, + NTDB_DATA, + void *), + void *data) { - struct ntdb_used_record rec; enum NTDB_ERROR ecode; - - ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec)); - if (ecode != NTDB_SUCCESS) { - return ecode; + const struct ntdb_used_record *r; + const unsigned char *kptr; + ntdb_len_t klen, dlen; + uint32_t hash; + ntdb_off_t off = off_and_hash & NTDB_OFF_MASK; + ntdb_off_t *p; + + /* Empty bucket is fine. 
*/ + if (!off_and_hash) { + return NTDB_SUCCESS; } - if (rec_magic(&rec) != NTDB_CHAIN_MAGIC) { + /* This can't point to a chain, we handled those at toplevel. */ + if (off_and_hash & (1ULL << NTDB_OFF_CHAIN_BIT)) { return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR, - "ntdb_check: Bad hash chain magic %llu", - (long long)rec_magic(&rec)); + "ntdb_check: Invalid chain bit in offset " + " %llu", (long long)off_and_hash); } - if (rec_data_length(&rec) != sizeof(struct ntdb_chain)) { - return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR, - "ntdb_check:" - " Bad hash chain length %llu vs %zu", - (long long)rec_data_length(&rec), - sizeof(struct ntdb_chain)); - } - if (rec_key_length(&rec) != 0) { + p = asearch(&off, used, num_used, off_cmp); + if (!p) { return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR, - "ntdb_check: Bad hash chain key length %llu", - (long long)rec_key_length(&rec)); - } - if (rec_hash(&rec) != 0) { - return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR, - "ntdb_check: Bad hash chain hash value %llu", - (long long)rec_hash(&rec)); + "ntdb_check: Invalid offset" + " %llu in hash", (long long)off); } + /* Mark it invalid. */ + *p ^= 1; + (*num_found)++; - off += sizeof(rec); - ecode = check_hash_tree(ntdb, off, 0, hash, 64, - used, num_used, num_found, check, data); - if (ecode != NTDB_SUCCESS) { - return ecode; + r = ntdb_access_read(ntdb, off, sizeof(*r), true); + if (NTDB_PTR_IS_ERR(r)) { + return NTDB_PTR_ERR(r); + } + klen = rec_key_length(r); + dlen = rec_data_length(r); + ntdb_access_release(ntdb, r); + + kptr = ntdb_access_read(ntdb, off + sizeof(*r), klen + dlen, false); + if (NTDB_PTR_IS_ERR(kptr)) { + return NTDB_PTR_ERR(kptr); + } + + hash = ntdb_hash(ntdb, kptr, klen); + + /* Are we in the right chain? 
*/ + if (bits_from(hash, 0, ntdb->hash_bits) != bucket) { + ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, + NTDB_LOG_ERROR, + "ntdb_check: Bad bucket %u vs %llu", + bits_from(hash, 0, ntdb->hash_bits), + (long long)bucket); + /* Next 8 bits should be the same as top bits of bucket. */ + } else if (bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL) + != bits_from(off_and_hash, 64-NTDB_OFF_UPPER_STEAL, + NTDB_OFF_UPPER_STEAL)) { + ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, + NTDB_LOG_ERROR, + "ntdb_check: Bad hash bits %llu vs %llu", + (long long)off_and_hash, + (long long)hash); + } else if (check) { + NTDB_DATA k, d; + + k = ntdb_mkdata(kptr, klen); + d = ntdb_mkdata(kptr + klen, dlen); + ecode = check(k, d, data); + } else { + ecode = NTDB_SUCCESS; } + ntdb_access_release(ntdb, kptr); - off = ntdb_read_off(ntdb, off + offsetof(struct ntdb_chain, next)); - if (NTDB_OFF_IS_ERR(off)) { - return NTDB_OFF_TO_ERR(off); - } - if (off == 0) - return NTDB_SUCCESS; - (*num_found)++; - return check_hash_chain(ntdb, off, hash, used, num_used, num_found, - check, data); + return ecode; } -static enum NTDB_ERROR check_hash_record(struct ntdb_context *ntdb, +static enum NTDB_ERROR check_hash_chain(struct ntdb_context *ntdb, ntdb_off_t off, - uint64_t hprefix, - unsigned hprefix_bits, + ntdb_len_t bucket, ntdb_off_t used[], size_t num_used, size_t *num_found, enum NTDB_ERROR (*check)(NTDB_DATA, - NTDB_DATA, - void *), + NTDB_DATA, + void *), void *data) { struct ntdb_used_record rec; enum NTDB_ERROR ecode; + const ntdb_off_t *entries; + ntdb_len_t i, num; - if (hprefix_bits >= 64) - return check_hash_chain(ntdb, off, hprefix, used, num_used, - num_found, check, data); + /* This is a used entry. 
*/ + (*num_found)++; ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec)); if (ecode != NTDB_SUCCESS) { return ecode; } - if (rec_magic(&rec) != NTDB_HTABLE_MAGIC) { + if (rec_magic(&rec) != NTDB_CHAIN_MAGIC) { return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR, - "ntdb_check: Bad hash table magic %llu", + "ntdb_check: Bad hash chain magic %llu", (long long)rec_magic(&rec)); } - if (rec_data_length(&rec) - != sizeof(ntdb_off_t) << NTDB_SUBLEVEL_HASH_BITS) { + + if (rec_data_length(&rec) % sizeof(ntdb_off_t)) { return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR, - "ntdb_check:" - " Bad hash table length %llu vs %llu", - (long long)rec_data_length(&rec), - (long long)sizeof(ntdb_off_t) - << NTDB_SUBLEVEL_HASH_BITS); + "ntdb_check: Bad hash chain data length %llu", + (long long)rec_data_length(&rec)); } + if (rec_key_length(&rec) != 0) { return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR, - "ntdb_check: Bad hash table key length %llu", + "ntdb_check: Bad hash chain key length %llu", (long long)rec_key_length(&rec)); } - if (rec_hash(&rec) != 0) { - return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR, - "ntdb_check: Bad hash table hash value %llu", - (long long)rec_hash(&rec)); - } off += sizeof(rec); - return check_hash_tree(ntdb, off, - NTDB_SUBLEVEL_HASH_BITS-NTDB_HASH_GROUP_BITS, - hprefix, hprefix_bits, - used, num_used, num_found, check, data); -} - -static int off_cmp(const ntdb_off_t *a, const ntdb_off_t *b) -{ - /* Can overflow an int. */ - return *a > *b ? 1 - : *a < *b ? -1 - : 0; -} + num = rec_data_length(&rec) / sizeof(ntdb_off_t); + entries = ntdb_access_read(ntdb, off, rec_data_length(&rec), true); + if (NTDB_PTR_IS_ERR(entries)) { + return NTDB_PTR_ERR(entries); + } -static uint64_t get_bits(uint64_t h, unsigned num, unsigned *used) -{ - *used += num; + /* Check each non-deleted entry in chain. 
*/ + for (i = 0; i < num; i++) { + ecode = check_entry(ntdb, entries[i], bucket, + used, num_used, num_found, check, data); + if (ecode) { + break; + } + } - return (h >> (64 - *used)) & ((1U << num) - 1); + ntdb_access_release(ntdb, entries); + return ecode; } -static enum NTDB_ERROR check_hash_tree(struct ntdb_context *ntdb, - ntdb_off_t off, unsigned int group_bits, - uint64_t hprefix, - unsigned hprefix_bits, - ntdb_off_t used[], - size_t num_used, - size_t *num_found, - enum NTDB_ERROR (*check)(NTDB_DATA, - NTDB_DATA, void *), - void *data) +static enum NTDB_ERROR check_hash(struct ntdb_context *ntdb, + ntdb_off_t used[], + size_t num_used, + size_t num_other_used, + enum NTDB_ERROR (*check)(NTDB_DATA, + NTDB_DATA, + void *), + void *data) { - unsigned int g, b; - const ntdb_off_t *hash; - struct ntdb_used_record rec; enum NTDB_ERROR ecode; + struct ntdb_used_record rec; + const ntdb_off_t *entries; + ntdb_len_t i; + /* Free tables and capabilities also show up as used, as do we. */ + size_t num_found = num_other_used + 1; - hash = ntdb_access_read(ntdb, off, - sizeof(ntdb_off_t) - << (group_bits + NTDB_HASH_GROUP_BITS), - true); - if (NTDB_PTR_IS_ERR(hash)) { - return NTDB_PTR_ERR(hash); - } - - for (g = 0; g < (1 << group_bits); g++) { - const ntdb_off_t *group = hash + (g << NTDB_HASH_GROUP_BITS); - for (b = 0; b < (1 << NTDB_HASH_GROUP_BITS); b++) { - unsigned int bucket, i, used_bits; - uint64_t h; - ntdb_off_t *p; - if (group[b] == 0) - continue; - - off = group[b] & NTDB_OFF_MASK; - p = asearch(&off, used, num_used, off_cmp); - if (!p) { - ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, - NTDB_LOG_ERROR, - "ntdb_check: Invalid offset" - " %llu in hash", - (long long)off); - goto fail; - } - /* Mark it invalid. */ - *p ^= 1; - (*num_found)++; - - if (hprefix_bits == 64) { - /* Chained entries are unordered. 
*/ - if (is_subhash(group[b])) { - ecode = NTDB_ERR_CORRUPT; - ntdb_logerr(ntdb, ecode, - NTDB_LOG_ERROR, - "ntdb_check: Invalid chain" - " entry subhash"); - goto fail; - } - h = hash_record(ntdb, off); - if (h != hprefix) { - ecode = NTDB_ERR_CORRUPT; - ntdb_logerr(ntdb, ecode, - NTDB_LOG_ERROR, - "check: bad hash chain" - " placement" - " 0x%llx vs 0x%llx", - (long long)h, - (long long)hprefix); - goto fail; - } - ecode = ntdb_read_convert(ntdb, off, &rec, - sizeof(rec)); - if (ecode != NTDB_SUCCESS) { - goto fail; - } - goto check; - } - - if (is_subhash(group[b])) { - uint64_t subprefix; - subprefix = (hprefix - << (group_bits + NTDB_HASH_GROUP_BITS)) - + g * (1 << NTDB_HASH_GROUP_BITS) + b; - - ecode = check_hash_record(ntdb, - group[b] & NTDB_OFF_MASK, - subprefix, - hprefix_bits - + group_bits - + NTDB_HASH_GROUP_BITS, - used, num_used, num_found, - check, data); - if (ecode != NTDB_SUCCESS) { - goto fail; - } - continue; - } - /* A normal entry */ - - /* Does it belong here at all? */ - h = hash_record(ntdb, off); - used_bits = 0; - if (get_bits(h, hprefix_bits, &used_bits) != hprefix - && hprefix_bits) { - ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, - NTDB_LOG_ERROR, - "check: bad hash placement" - " 0x%llx vs 0x%llx", - (long long)h, - (long long)hprefix); - goto fail; - } - - /* Does it belong in this group? */ - if (get_bits(h, group_bits, &used_bits) != g) { - ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, - NTDB_LOG_ERROR, - "check: bad group %llu" - " vs %u", - (long long)h, g); - goto fail; - } - - /* Are bucket bits correct? 
*/ - bucket = group[b] & NTDB_OFF_HASH_GROUP_MASK; - if (get_bits(h, NTDB_HASH_GROUP_BITS, &used_bits) - != bucket) { - used_bits -= NTDB_HASH_GROUP_BITS; - ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, - NTDB_LOG_ERROR, - "check: bad bucket %u vs %u", - (unsigned)get_bits(h, - NTDB_HASH_GROUP_BITS, - &used_bits), - bucket); - goto fail; - } + ecode = ntdb_read_convert(ntdb, NTDB_HASH_OFFSET, &rec, sizeof(rec)); + if (ecode != NTDB_SUCCESS) { + return ecode; + } - /* There must not be any zero entries between - * the bucket it belongs in and this one! */ - for (i = bucket; - i != b; - i = (i + 1) % (1 << NTDB_HASH_GROUP_BITS)) { - if (group[i] == 0) { - ecode = NTDB_ERR_CORRUPT; - ntdb_logerr(ntdb, ecode, - NTDB_LOG_ERROR, - "check: bad group placement" - " %u vs %u", - b, bucket); - goto fail; - } - } + if (rec_magic(&rec) != NTDB_HTABLE_MAGIC) { + return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR, + "ntdb_check: Bad hash table magic %llu", + (long long)rec_magic(&rec)); + } - ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec)); - if (ecode != NTDB_SUCCESS) { - goto fail; - } + if (rec_data_length(&rec) != (sizeof(ntdb_off_t) << ntdb->hash_bits)) { + return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR, + "ntdb_check: Bad hash table data length %llu", + (long long)rec_data_length(&rec)); + } - /* Bottom bits must match header. 
*/ - if ((h & ((1 << 11)-1)) != rec_hash(&rec)) { - ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, - NTDB_LOG_ERROR, - "ntdb_check: Bad hash magic" - " at offset %llu" - " (0x%llx vs 0x%llx)", - (long long)off, - (long long)h, - (long long)rec_hash(&rec)); - goto fail; - } + if (rec_key_length(&rec) != 0) { + return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR, + "ntdb_check: Bad hash table key length %llu", + (long long)rec_key_length(&rec)); + } - check: - if (check) { - NTDB_DATA k, d; - const unsigned char *kptr; - - kptr = ntdb_access_read(ntdb, - off + sizeof(rec), - rec_key_length(&rec) - + rec_data_length(&rec), - false); - if (NTDB_PTR_IS_ERR(kptr)) { - ecode = NTDB_PTR_ERR(kptr); - goto fail; - } + entries = ntdb_access_read(ntdb, NTDB_HASH_OFFSET + sizeof(rec), + rec_data_length(&rec), true); + if (NTDB_PTR_IS_ERR(entries)) { + return NTDB_PTR_ERR(entries); + } - k = ntdb_mkdata(kptr, rec_key_length(&rec)); - d = ntdb_mkdata(kptr + k.dsize, - rec_data_length(&rec)); - ecode = check(k, d, data); - ntdb_access_release(ntdb, kptr); - if (ecode != NTDB_SUCCESS) { - goto fail; - } - } + for (i = 0; i < (1 << ntdb->hash_bits); i++) { + ntdb_off_t off = entries[i] & NTDB_OFF_MASK; + if (entries[i] & (1ULL << NTDB_OFF_CHAIN_BIT)) { + ecode = check_hash_chain(ntdb, off, i, + used, num_used, &num_found, + check, data); + } else { + ecode = check_entry(ntdb, entries[i], i, + used, num_used, &num_found, + check, data); + } + if (ecode) { + break; } } - ntdb_access_release(ntdb, hash); - return NTDB_SUCCESS; - -fail: - ntdb_access_release(ntdb, hash); - return ecode; -} + ntdb_access_release(ntdb, entries); -static enum NTDB_ERROR check_hash(struct ntdb_context *ntdb, - ntdb_off_t used[], - size_t num_used, size_t num_other_used, - enum NTDB_ERROR (*check)(NTDB_DATA, NTDB_DATA, void *), - void *data) -{ - /* Free tables and capabilities also show up as used. 
*/ - size_t num_found = num_other_used; - enum NTDB_ERROR ecode; - - ecode = check_hash_tree(ntdb, offsetof(struct ntdb_header, hashtable), - NTDB_TOPLEVEL_HASH_BITS-NTDB_HASH_GROUP_BITS, - 0, 0, used, num_used, &num_found, - check, data); - if (ecode == NTDB_SUCCESS) { - if (num_found != num_used) { - ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR, - "ntdb_check: Not all entries" - " are in hash"); - } + if (ecode == NTDB_SUCCESS && num_found != num_used) { + ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR, + "ntdb_check: Not all entries are in hash"); } return ecode; } @@ -547,8 +402,7 @@ static enum NTDB_ERROR check_free_table(struct ntdb_context *ntdb, if (rec_magic(&ft.hdr) != NTDB_FTABLE_MAGIC || rec_key_length(&ft.hdr) != 0 - || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr) - || rec_hash(&ft.hdr) != 0) { + || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)) { return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR, "ntdb_check: Invalid header on free table"); } |