diff options
Diffstat (limited to 'lib/ntdb/hash.c')
-rw-r--r-- | lib/ntdb/hash.c | 1050 |
1 files changed, 395 insertions, 655 deletions
diff --git a/lib/ntdb/hash.c b/lib/ntdb/hash.c index e87705b123..69192a119b 100644 --- a/lib/ntdb/hash.c +++ b/lib/ntdb/hash.c @@ -17,68 +17,23 @@ */ #include "private.h" #include <ccan/hash/hash.h> -#include <assert.h> /* Default hash function. */ -uint64_t ntdb_jenkins_hash(const void *key, size_t length, uint64_t seed, +uint32_t ntdb_jenkins_hash(const void *key, size_t length, uint32_t seed, void *unused) { - uint64_t ret; - /* hash64_stable assumes lower bits are more important; they are a - * slightly better hash. We use the upper bits first, so swap them. */ - ret = hash64_stable((const unsigned char *)key, length, seed); - return (ret >> 32) | (ret << 32); + return hash_stable((const unsigned char *)key, length, seed); } -uint64_t ntdb_hash(struct ntdb_context *ntdb, const void *ptr, size_t len) +uint32_t ntdb_hash(struct ntdb_context *ntdb, const void *ptr, size_t len) { return ntdb->hash_fn(ptr, len, ntdb->hash_seed, ntdb->hash_data); } -uint64_t hash_record(struct ntdb_context *ntdb, ntdb_off_t off) -{ - const struct ntdb_used_record *r; - const void *key; - uint64_t klen, hash; - - r = ntdb_access_read(ntdb, off, sizeof(*r), true); - if (NTDB_PTR_IS_ERR(r)) { - /* FIXME */ - return 0; - } - - klen = rec_key_length(r); - ntdb_access_release(ntdb, r); - - key = ntdb_access_read(ntdb, off + sizeof(*r), klen, false); - if (NTDB_PTR_IS_ERR(key)) { - return 0; - } - - hash = ntdb_hash(ntdb, key, klen); - ntdb_access_release(ntdb, key); - return hash; -} - -/* Get bits from a value. */ -static uint32_t bits_from(uint64_t val, unsigned start, unsigned num) -{ - assert(num <= 32); - return (val >> start) & ((1U << num) - 1); -} - -/* We take bits from the top: that way we can lock whole sections of the hash - * by using lock ranges. */ -static uint32_t use_bits(struct hash_info *h, unsigned num) -{ - h->hash_used += num; - return bits_from(h->h, 64 - h->hash_used, num); -} - static ntdb_bool_err key_matches(struct ntdb_context *ntdb, - const struct ntdb_used_record *rec, - ntdb_off_t off, - const NTDB_DATA *key) + const struct ntdb_used_record *rec, + ntdb_off_t off, + const NTDB_DATA *key) { ntdb_bool_err ret = false; const char *rkey; @@ -102,792 +57,577 @@ static ntdb_bool_err key_matches(struct ntdb_context *ntdb, /* Does entry match? */ static ntdb_bool_err match(struct ntdb_context *ntdb, - struct hash_info *h, - const NTDB_DATA *key, - ntdb_off_t val, - struct ntdb_used_record *rec) + uint32_t hash, + const NTDB_DATA *key, + ntdb_off_t val, + struct ntdb_used_record *rec, + const ntdb_off_t **mapped) { ntdb_off_t off; enum NTDB_ERROR ecode; ntdb->stats.compares++; - /* Desired bucket must match. */ - if (h->home_bucket != (val & NTDB_OFF_HASH_GROUP_MASK)) { - ntdb->stats.compare_wrong_bucket++; - return false; - } /* Top bits of offset == next bits of hash. */ - if (bits_from(val, NTDB_OFF_HASH_EXTRA_BIT, NTDB_OFF_UPPER_STEAL_EXTRA) - != bits_from(h->h, 64 - h->hash_used - NTDB_OFF_UPPER_STEAL_EXTRA, - NTDB_OFF_UPPER_STEAL_EXTRA)) { + if (bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL) + != bits_from(val, 64-NTDB_OFF_UPPER_STEAL, NTDB_OFF_UPPER_STEAL)) { ntdb->stats.compare_wrong_offsetbits++; return false; } + /* Unmap before we try to read actual record, which may cause expand */ + if (mapped) { + ntdb_access_release(ntdb, *mapped); + *mapped = NULL; + } + off = val & NTDB_OFF_MASK; ecode = ntdb_read_convert(ntdb, off, rec, sizeof(*rec)); if (ecode != NTDB_SUCCESS) { return (ntdb_bool_err)ecode; } - if ((h->h & ((1 << 11)-1)) != rec_hash(rec)) { - ntdb->stats.compare_wrong_rechash++; - return false; - } - return key_matches(ntdb, rec, off, key); } -static ntdb_off_t hbucket_off(ntdb_off_t group_start, unsigned bucket) -{ - return group_start - + (bucket % (1 << NTDB_HASH_GROUP_BITS)) * sizeof(ntdb_off_t); -} - -bool is_subhash(ntdb_off_t val) +static bool is_chain(ntdb_off_t val) { - return (val >> NTDB_OFF_UPPER_STEAL_SUBHASH_BIT) & 1; + return val & (1ULL << NTDB_OFF_CHAIN_BIT); } -/* FIXME: Guess the depth, don't over-lock! */ -static ntdb_off_t hlock_range(ntdb_off_t group, ntdb_off_t *size) +static ntdb_off_t hbucket_off(ntdb_off_t base, ntdb_len_t idx) { - *size = 1ULL << (64 - (NTDB_TOPLEVEL_HASH_BITS - NTDB_HASH_GROUP_BITS)); - return group << (64 - (NTDB_TOPLEVEL_HASH_BITS - NTDB_HASH_GROUP_BITS)); -} - -static ntdb_off_t COLD find_in_chain(struct ntdb_context *ntdb, - NTDB_DATA key, - ntdb_off_t chain, - struct hash_info *h, - struct ntdb_used_record *rec, - struct traverse_info *tinfo) -{ - ntdb_off_t off, next; - enum NTDB_ERROR ecode; - - /* In case nothing is free, we set these to zero. */ - h->home_bucket = h->found_bucket = 0; - - for (off = chain; off; off = next) { - unsigned int i; - - h->group_start = off; - ecode = ntdb_read_convert(ntdb, off, h->group, sizeof(h->group)); - if (ecode != NTDB_SUCCESS) { - return NTDB_ERR_TO_OFF(ecode); - } - - for (i = 0; i < (1 << NTDB_HASH_GROUP_BITS); i++) { - ntdb_off_t recoff; - if (!h->group[i]) { - /* Remember this empty bucket. */ - h->home_bucket = h->found_bucket = i; - continue; - } - - /* We can insert extra bits via add_to_hash - * empty bucket logic. */ - recoff = h->group[i] & NTDB_OFF_MASK; - ecode = ntdb_read_convert(ntdb, recoff, rec, - sizeof(*rec)); - if (ecode != NTDB_SUCCESS) { - return NTDB_ERR_TO_OFF(ecode); - } - - ecode = NTDB_OFF_TO_ERR(key_matches(ntdb, rec, recoff, - &key)); - if (ecode < 0) { - return NTDB_ERR_TO_OFF(ecode); - } - if (ecode == (enum NTDB_ERROR)1) { - h->home_bucket = h->found_bucket = i; - - if (tinfo) { - tinfo->levels[tinfo->num_levels] - .hashtable = off; - tinfo->levels[tinfo->num_levels] - .total_buckets - = 1 << NTDB_HASH_GROUP_BITS; - tinfo->levels[tinfo->num_levels].entry - = i; - tinfo->num_levels++; - } - return recoff; - } - } - next = ntdb_read_off(ntdb, off - + offsetof(struct ntdb_chain, next)); - if (NTDB_OFF_IS_ERR(next)) { - return next; - } - if (next) - next += sizeof(struct ntdb_used_record); - } - return 0; + return base + sizeof(struct ntdb_used_record) + + idx * sizeof(ntdb_off_t); } /* This is the core routine which searches the hashtable for an entry. * On error, no locks are held and -ve is returned. - * Otherwise, hinfo is filled in (and the optional tinfo). + * Otherwise, hinfo is filled in. * If not found, the return value is 0. * If found, the return value is the offset, and *rec is the record. */ ntdb_off_t find_and_lock(struct ntdb_context *ntdb, NTDB_DATA key, int ltype, struct hash_info *h, - struct ntdb_used_record *rec, - struct traverse_info *tinfo) + struct ntdb_used_record *rec) { - uint32_t i, group; - ntdb_off_t hashtable; + ntdb_off_t off, val; + const ntdb_off_t *arr = NULL; + ntdb_len_t i; + bool found_empty; enum NTDB_ERROR ecode; + struct ntdb_used_record chdr; + ntdb_bool_err berr; h->h = ntdb_hash(ntdb, key.dptr, key.dsize); - h->hash_used = 0; - group = use_bits(h, NTDB_TOPLEVEL_HASH_BITS - NTDB_HASH_GROUP_BITS); - h->home_bucket = use_bits(h, NTDB_HASH_GROUP_BITS); - h->hlock_start = hlock_range(group, &h->hlock_range); - ecode = ntdb_lock_hashes(ntdb, h->hlock_start, h->hlock_range, ltype, - NTDB_LOCK_WAIT); + h->table = NTDB_HASH_OFFSET; + h->table_size = 1 << ntdb->hash_bits; + h->bucket = bits_from(h->h, 0, ntdb->hash_bits); + h->old_val = 0; + + ecode = ntdb_lock_hash(ntdb, h->bucket, ltype); if (ecode != NTDB_SUCCESS) { return NTDB_ERR_TO_OFF(ecode); } - hashtable = offsetof(struct ntdb_header, hashtable); - if (tinfo) { - tinfo->toplevel_group = group; - tinfo->num_levels = 1; - tinfo->levels[0].entry = 0; - tinfo->levels[0].hashtable = hashtable - + (group << NTDB_HASH_GROUP_BITS) * sizeof(ntdb_off_t); - tinfo->levels[0].total_buckets = 1 << NTDB_HASH_GROUP_BITS; + off = hbucket_off(h->table, h->bucket); + val = ntdb_read_off(ntdb, off); + if (NTDB_OFF_IS_ERR(val)) { + ecode = NTDB_OFF_TO_ERR(val); + goto fail; } - while (h->hash_used <= 64) { - /* Read in the hash group. */ - h->group_start = hashtable - + group * (sizeof(ntdb_off_t) << NTDB_HASH_GROUP_BITS); - - ecode = ntdb_read_convert(ntdb, h->group_start, &h->group, - sizeof(h->group)); - if (ecode != NTDB_SUCCESS) { - goto fail; - } - - /* Pointer to another hash table? Go down... */ - if (is_subhash(h->group[h->home_bucket])) { - hashtable = (h->group[h->home_bucket] & NTDB_OFF_MASK) - + sizeof(struct ntdb_used_record); - if (tinfo) { - /* When we come back, use *next* bucket */ - tinfo->levels[tinfo->num_levels-1].entry - += h->home_bucket + 1; + /* Directly in hash table? */ + if (!is_chain(val)) { + if (val) { + berr = match(ntdb, h->h, &key, val, rec, NULL); + if (berr < 0) { + ecode = NTDB_OFF_TO_ERR(berr); + goto fail; } - group = use_bits(h, NTDB_SUBLEVEL_HASH_BITS - - NTDB_HASH_GROUP_BITS); - h->home_bucket = use_bits(h, NTDB_HASH_GROUP_BITS); - if (tinfo) { - tinfo->levels[tinfo->num_levels].hashtable - = hashtable; - tinfo->levels[tinfo->num_levels].total_buckets - = 1 << NTDB_SUBLEVEL_HASH_BITS; - tinfo->levels[tinfo->num_levels].entry - = group << NTDB_HASH_GROUP_BITS; - tinfo->num_levels++; + if (berr) { + return val & NTDB_OFF_MASK; } - continue; + /* If you want to insert here, make a chain. */ + h->old_val = val; } + return 0; + } - /* It's in this group: search (until 0 or all searched) */ - for (i = 0, h->found_bucket = h->home_bucket; - i < (1 << NTDB_HASH_GROUP_BITS); - i++, h->found_bucket = ((h->found_bucket+1) - % (1 << NTDB_HASH_GROUP_BITS))) { - ntdb_bool_err berr; - if (is_subhash(h->group[h->found_bucket])) - continue; + /* Nope? Iterate through chain. */ + h->table = val & NTDB_OFF_MASK; - if (!h->group[h->found_bucket]) - break; + ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr)); + if (ecode != NTDB_SUCCESS) { + goto fail; + } + + if (rec_magic(&chdr) != NTDB_CHAIN_MAGIC) { + ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, + NTDB_LOG_ERROR, + "find_and_lock:" + " corrupt record %#x at %llu", + rec_magic(&chdr), (long long)off); + goto fail; + } - berr = match(ntdb, h, &key, h->group[h->found_bucket], - rec); + h->table_size = rec_data_length(&chdr) / sizeof(ntdb_off_t); + + found_empty = false; + for (i = 0; i < h->table_size; i++) { + /* Careful! match has to unmap this if we access a + * record (may cause mmap of database to move. */ + if (!arr) { + arr = ntdb_access_read(ntdb, hbucket_off(h->table, 0), + rec_data_length(&chdr), true); + if (NTDB_PTR_IS_ERR(arr)) { + ecode = NTDB_PTR_ERR(arr); + goto fail; + } + } + + val = arr[i]; + if (val == 0) { + if (!found_empty) { + h->bucket = i; + found_empty = true; + } + } else { + berr = match(ntdb, h->h, &key, val, rec, &arr); if (berr < 0) { ecode = NTDB_OFF_TO_ERR(berr); + if (arr) { + ntdb_access_release(ntdb, arr); + } goto fail; } if (berr) { - if (tinfo) { - tinfo->levels[tinfo->num_levels-1].entry - += h->found_bucket; + /* We found it! */ + h->bucket = i; + off = val & NTDB_OFF_MASK; + if (arr) { + ntdb_access_release(ntdb, arr); } - return h->group[h->found_bucket] & NTDB_OFF_MASK; + return off; } } - /* Didn't find it: h indicates where it would go. */ - return 0; + } + if (!found_empty) { + /* Set to any non-zero value */ + h->old_val = 1; + h->bucket = i; } - return find_in_chain(ntdb, key, hashtable, h, rec, tinfo); + if (arr) { + ntdb_access_release(ntdb, arr); + } + return 0; fail: - ntdb_unlock_hashes(ntdb, h->hlock_start, h->hlock_range, ltype); + ntdb_unlock_hash(ntdb, h->bucket, ltype); return NTDB_ERR_TO_OFF(ecode); } -/* I wrote a simple test, expanding a hash to 2GB, for the following - * cases: - * 1) Expanding all the buckets at once, - * 2) Expanding the bucket we wanted to place the new entry into. - * 3) Expanding the most-populated bucket, - * - * I measured the worst/average/best density during this process. - * 1) 3%/16%/30% - * 2) 4%/20%/38% - * 3) 6%/22%/41% - * - * So we figure out the busiest bucket for the moment. - */ -static unsigned fullest_bucket(struct ntdb_context *ntdb, - const ntdb_off_t *group, - unsigned new_bucket) -{ - unsigned counts[1 << NTDB_HASH_GROUP_BITS] = { 0 }; - unsigned int i, best_bucket; - - /* Count the new entry. */ - counts[new_bucket]++; - best_bucket = new_bucket; - - for (i = 0; i < (1 << NTDB_HASH_GROUP_BITS); i++) { - unsigned this_bucket; - - if (is_subhash(group[i])) - continue; - this_bucket = group[i] & NTDB_OFF_HASH_GROUP_MASK; - if (++counts[this_bucket] > counts[best_bucket]) - best_bucket = this_bucket; - } - - return best_bucket; -} - -static bool put_into_group(ntdb_off_t *group, - unsigned bucket, ntdb_off_t encoded) +static ntdb_off_t encode_offset(const struct ntdb_context *ntdb, + ntdb_off_t new_off, uint32_t hash) { - unsigned int i; + ntdb_off_t extra; - for (i = 0; i < (1 << NTDB_HASH_GROUP_BITS); i++) { - unsigned b = (bucket + i) % (1 << NTDB_HASH_GROUP_BITS); + assert((new_off & (1ULL << NTDB_OFF_CHAIN_BIT)) == 0); + assert((new_off >> (64 - NTDB_OFF_UPPER_STEAL)) == 0); + /* We pack extra hash bits into the upper bits of the offset. */ + extra = bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL); + extra <<= (64 - NTDB_OFF_UPPER_STEAL); - if (group[b] == 0) { - group[b] = encoded; - return true; - } - } - return false; + return new_off | extra; } -static void force_into_group(ntdb_off_t *group, - unsigned bucket, ntdb_off_t encoded) +/* Simply overwrite the hash entry we found before. */ +enum NTDB_ERROR replace_in_hash(struct ntdb_context *ntdb, + const struct hash_info *h, + ntdb_off_t new_off) { - if (!put_into_group(group, bucket, encoded)) - abort(); + return ntdb_write_off(ntdb, hbucket_off(h->table, h->bucket), + encode_offset(ntdb, new_off, h->h)); } -static ntdb_off_t encode_offset(ntdb_off_t new_off, struct hash_info *h) +enum NTDB_ERROR delete_from_hash(struct ntdb_context *ntdb, + const struct hash_info *h) { - return h->home_bucket - | new_off - | ((uint64_t)bits_from(h->h, - 64 - h->hash_used - NTDB_OFF_UPPER_STEAL_EXTRA, - NTDB_OFF_UPPER_STEAL_EXTRA) - << NTDB_OFF_HASH_EXTRA_BIT); + return ntdb_write_off(ntdb, hbucket_off(h->table, h->bucket), 0); } -/* Simply overwrite the hash entry we found before. */ -enum NTDB_ERROR replace_in_hash(struct ntdb_context *ntdb, - struct hash_info *h, - ntdb_off_t new_off) -{ - return ntdb_write_off(ntdb, hbucket_off(h->group_start, h->found_bucket), - encode_offset(new_off, h)); -} -/* We slot in anywhere that's empty in the chain. */ -static enum NTDB_ERROR COLD add_to_chain(struct ntdb_context *ntdb, - ntdb_off_t subhash, - ntdb_off_t new_off) +enum NTDB_ERROR add_to_hash(struct ntdb_context *ntdb, + const struct hash_info *h, + ntdb_off_t new_off) { - ntdb_off_t entry; enum NTDB_ERROR ecode; + ntdb_off_t chain; + struct ntdb_used_record chdr; + const ntdb_off_t *old; + ntdb_off_t *new; - entry = ntdb_find_zero_off(ntdb, subhash, 1<<NTDB_HASH_GROUP_BITS); - if (NTDB_OFF_IS_ERR(entry)) { - return NTDB_OFF_TO_ERR(entry); + /* We hit an empty bucket during search? That's where it goes. */ + if (!h->old_val) { + return replace_in_hash(ntdb, h, new_off); } - if (entry == 1 << NTDB_HASH_GROUP_BITS) { - ntdb_off_t next; + /* Full at top-level? Create a 2-element chain. */ + if (h->table == NTDB_HASH_OFFSET) { + ntdb_off_t pair[2]; - next = ntdb_read_off(ntdb, subhash - + offsetof(struct ntdb_chain, next)); - if (NTDB_OFF_IS_ERR(next)) { - return NTDB_OFF_TO_ERR(next); - } + /* One element is old value, the other is the new value. */ + pair[0] = h->old_val; + pair[1] = encode_offset(ntdb, new_off, h->h); - if (!next) { - next = alloc(ntdb, 0, sizeof(struct ntdb_chain), 0, - NTDB_CHAIN_MAGIC, false); - if (NTDB_OFF_IS_ERR(next)) - return NTDB_OFF_TO_ERR(next); - ecode = zero_out(ntdb, - next+sizeof(struct ntdb_used_record), - sizeof(struct ntdb_chain)); - if (ecode != NTDB_SUCCESS) { - return ecode; - } - ecode = ntdb_write_off(ntdb, subhash - + offsetof(struct ntdb_chain, - next), - next); - if (ecode != NTDB_SUCCESS) { - return ecode; - } + chain = alloc(ntdb, 0, sizeof(pair), NTDB_CHAIN_MAGIC, true); + if (NTDB_OFF_IS_ERR(chain)) { + return NTDB_OFF_TO_ERR(chain); } - return add_to_chain(ntdb, next, new_off); + ecode = ntdb_write_convert(ntdb, + chain + + sizeof(struct ntdb_used_record), + pair, sizeof(pair)); + if (ecode == NTDB_SUCCESS) { + ecode = ntdb_write_off(ntdb, + hbucket_off(h->table, h->bucket), + chain + | (1ULL << NTDB_OFF_CHAIN_BIT)); + } + return ecode; } - return ntdb_write_off(ntdb, subhash + entry * sizeof(ntdb_off_t), - new_off); -} - -/* Add into a newly created subhash. */ -static enum NTDB_ERROR add_to_subhash(struct ntdb_context *ntdb, ntdb_off_t subhash, - unsigned hash_used, ntdb_off_t val) -{ - ntdb_off_t off = (val & NTDB_OFF_MASK), *group; - struct hash_info h; - unsigned int gnum; - - h.hash_used = hash_used; - - if (hash_used + NTDB_SUBLEVEL_HASH_BITS > 64) - return add_to_chain(ntdb, subhash, off); - - h.h = hash_record(ntdb, off); - gnum = use_bits(&h, NTDB_SUBLEVEL_HASH_BITS-NTDB_HASH_GROUP_BITS); - h.group_start = subhash - + gnum * (sizeof(ntdb_off_t) << NTDB_HASH_GROUP_BITS); - h.home_bucket = use_bits(&h, NTDB_HASH_GROUP_BITS); - - group = ntdb_access_write(ntdb, h.group_start, - sizeof(*group) << NTDB_HASH_GROUP_BITS, true); - if (NTDB_PTR_IS_ERR(group)) { - return NTDB_PTR_ERR(group); + /* Full bucket. Expand. */ + ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr)); + if (ecode != NTDB_SUCCESS) { + return ecode; } - force_into_group(group, h.home_bucket, encode_offset(off, &h)); - return ntdb_access_commit(ntdb, group); -} -static enum NTDB_ERROR expand_group(struct ntdb_context *ntdb, struct hash_info *h) -{ - unsigned bucket, num_vals, i, magic; - size_t subsize; - ntdb_off_t subhash; - ntdb_off_t vals[1 << NTDB_HASH_GROUP_BITS]; - enum NTDB_ERROR ecode; + if (rec_extra_padding(&chdr) >= sizeof(new_off)) { + /* Expand in place. */ + uint64_t dlen = rec_data_length(&chdr); - /* Attach new empty subhash under fullest bucket. */ - bucket = fullest_bucket(ntdb, h->group, h->home_bucket); + ecode = set_header(ntdb, &chdr, NTDB_CHAIN_MAGIC, 0, + dlen + sizeof(new_off), + dlen + rec_extra_padding(&chdr)); - if (h->hash_used == 64) { - ntdb->stats.alloc_chain++; - subsize = sizeof(struct ntdb_chain); - magic = NTDB_CHAIN_MAGIC; - } else { - ntdb->stats.alloc_subhash++; - subsize = (sizeof(ntdb_off_t) << NTDB_SUBLEVEL_HASH_BITS); - magic = NTDB_HTABLE_MAGIC; + if (ecode != NTDB_SUCCESS) { + return ecode; + } + /* find_and_lock set up h to point to last bucket. */ + ecode = replace_in_hash(ntdb, h, new_off); + if (ecode != NTDB_SUCCESS) { + return ecode; + } + ecode = ntdb_write_convert(ntdb, h->table, &chdr, sizeof(chdr)); + if (ecode != NTDB_SUCCESS) { + return ecode; + } + /* For futureproofing, we always make the first byte of padding + * a zero. */ + if (rec_extra_padding(&chdr)) { + ecode = ntdb->io->twrite(ntdb, h->table + sizeof(chdr) + + dlen + sizeof(new_off), + "", 1); + } + return ecode; } - subhash = alloc(ntdb, 0, subsize, 0, magic, false); - if (NTDB_OFF_IS_ERR(subhash)) { - return NTDB_OFF_TO_ERR(subhash); + /* We need to reallocate the chain. */ + chain = alloc(ntdb, 0, (h->table_size + 1) * sizeof(ntdb_off_t), + NTDB_CHAIN_MAGIC, true); + if (NTDB_OFF_IS_ERR(chain)) { + return NTDB_OFF_TO_ERR(chain); } - ecode = zero_out(ntdb, subhash + sizeof(struct ntdb_used_record), - subsize); - if (ecode != NTDB_SUCCESS) { - return ecode; + /* Map both and copy across old buckets. */ + old = ntdb_access_read(ntdb, hbucket_off(h->table, 0), + h->table_size*sizeof(ntdb_off_t), true); + if (NTDB_PTR_IS_ERR(old)) { + return NTDB_PTR_ERR(old); } - - /* Remove any which are destined for bucket or are in wrong place. */ - num_vals = 0; - for (i = 0; i < (1 << NTDB_HASH_GROUP_BITS); i++) { - unsigned home_bucket = h->group[i] & NTDB_OFF_HASH_GROUP_MASK; - if (!h->group[i] || is_subhash(h->group[i])) - continue; - if (home_bucket == bucket || home_bucket != i) { - vals[num_vals++] = h->group[i]; - h->group[i] = 0; - } + new = ntdb_access_write(ntdb, hbucket_off(chain, 0), + (h->table_size + 1)*sizeof(ntdb_off_t), true); + if (NTDB_PTR_IS_ERR(new)) { + ntdb_access_release(ntdb, old); + return NTDB_PTR_ERR(new); } - /* FIXME: This assert is valid, but we do this during unit test :( */ - /* assert(num_vals); */ - - /* Overwrite expanded bucket with subhash pointer. */ - h->group[bucket] = subhash | (1ULL << NTDB_OFF_UPPER_STEAL_SUBHASH_BIT); - /* Point to actual contents of record. */ - subhash += sizeof(struct ntdb_used_record); + memcpy(new, old, h->bucket * sizeof(ntdb_off_t)); + new[h->bucket] = encode_offset(ntdb, new_off, h->h); + ntdb_access_release(ntdb, old); - /* Put values back. */ - for (i = 0; i < num_vals; i++) { - unsigned this_bucket = vals[i] & NTDB_OFF_HASH_GROUP_MASK; - - if (this_bucket == bucket) { - ecode = add_to_subhash(ntdb, subhash, h->hash_used, - vals[i]); - if (ecode != NTDB_SUCCESS) - return ecode; - } else { - /* There should be room to put this back. */ - force_into_group(h->group, this_bucket, vals[i]); - } + ecode = ntdb_access_commit(ntdb, new); + if (ecode != NTDB_SUCCESS) { + return ecode; } - return NTDB_SUCCESS; + + /* Free the old chain. */ + ecode = add_free_record(ntdb, h->table, + sizeof(struct ntdb_used_record) + + rec_data_length(&chdr) + + rec_extra_padding(&chdr), + NTDB_LOCK_WAIT, true); + + /* Replace top-level to point to new chain */ + return ntdb_write_off(ntdb, + hbucket_off(NTDB_HASH_OFFSET, + bits_from(h->h, 0, ntdb->hash_bits)), + chain | (1ULL << NTDB_OFF_CHAIN_BIT)); } -enum NTDB_ERROR delete_from_hash(struct ntdb_context *ntdb, struct hash_info *h) +/* Traverse support: returns offset of record, or 0 or -ve error. */ +static ntdb_off_t iterate_chain(struct ntdb_context *ntdb, + ntdb_off_t val, + struct hash_info *h) { - unsigned int i, num_movers = 0; - ntdb_off_t movers[1 << NTDB_HASH_GROUP_BITS]; + ntdb_off_t i; + enum NTDB_ERROR ecode; + struct ntdb_used_record chdr; - h->group[h->found_bucket] = 0; - for (i = 1; i < (1 << NTDB_HASH_GROUP_BITS); i++) { - unsigned this_bucket; + /* First load up chain header. */ + h->table = val & NTDB_OFF_MASK; + ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr)); + if (ecode != NTDB_SUCCESS) { + return ecode; + } - this_bucket = (h->found_bucket+i) % (1 << NTDB_HASH_GROUP_BITS); - /* Empty bucket? We're done. */ - if (!h->group[this_bucket]) - break; + if (rec_magic(&chdr) != NTDB_CHAIN_MAGIC) { + return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, + NTDB_LOG_ERROR, + "get_table:" + " corrupt record %#x at %llu", + rec_magic(&chdr), + (long long)h->table); + } - /* Ignore subhashes. */ - if (is_subhash(h->group[this_bucket])) - continue; + /* Chain length is implied by data length. */ + h->table_size = rec_data_length(&chdr) / sizeof(ntdb_off_t); - /* If this one is not happy where it is, we'll move it. */ - if ((h->group[this_bucket] & NTDB_OFF_HASH_GROUP_MASK) - != this_bucket) { - movers[num_movers++] = h->group[this_bucket]; - h->group[this_bucket] = 0; - } + i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0), h->bucket, + h->table_size); + if (NTDB_OFF_IS_ERR(i)) { + return i; } - /* Put back the ones we erased. */ - for (i = 0; i < num_movers; i++) { - force_into_group(h->group, movers[i] & NTDB_OFF_HASH_GROUP_MASK, - movers[i]); + if (i != h->table_size) { + /* Return to next bucket. */ + h->bucket = i + 1; + val = ntdb_read_off(ntdb, hbucket_off(h->table, i)); + if (NTDB_OFF_IS_ERR(val)) { + return val; + } + return val & NTDB_OFF_MASK; } - /* Now we write back the hash group */ - return ntdb_write_convert(ntdb, h->group_start, - h->group, sizeof(h->group)); + /* Go back up to hash table. */ + h->table = NTDB_HASH_OFFSET; + h->table_size = 1 << ntdb->hash_bits; + h->bucket = bits_from(h->h, 0, ntdb->hash_bits) + 1; + return 0; } -enum NTDB_ERROR add_to_hash(struct ntdb_context *ntdb, struct hash_info *h, - ntdb_off_t new_off) +/* Keeps hash locked unless returns 0 or error. */ +static ntdb_off_t lock_and_iterate_hash(struct ntdb_context *ntdb, + struct hash_info *h) { + ntdb_off_t val, i; enum NTDB_ERROR ecode; - /* We hit an empty bucket during search? That's where it goes. */ - if (!h->group[h->found_bucket]) { - h->group[h->found_bucket] = encode_offset(new_off, h); - /* Write back the modified group. */ - return ntdb_write_convert(ntdb, h->group_start, - h->group, sizeof(h->group)); - } - - if (h->hash_used > 64) - return add_to_chain(ntdb, h->group_start, new_off); - - /* We're full. Expand. */ - ecode = expand_group(ntdb, h); - if (ecode != NTDB_SUCCESS) { - return ecode; - } + if (h->table != NTDB_HASH_OFFSET) { + /* We're in a chain. */ + i = bits_from(h->h, 0, ntdb->hash_bits); + ecode = ntdb_lock_hash(ntdb, i, F_RDLCK); + if (ecode != NTDB_SUCCESS) { + return NTDB_ERR_TO_OFF(ecode); + } - if (is_subhash(h->group[h->home_bucket])) { - /* We were expanded! */ - ntdb_off_t hashtable; - unsigned int gnum; + /* We dropped lock, bucket might have moved! */ + val = ntdb_read_off(ntdb, hbucket_off(NTDB_HASH_OFFSET, i)); + if (NTDB_OFF_IS_ERR(val)) { + goto unlock; + } - /* Write back the modified group. */ - ecode = ntdb_write_convert(ntdb, h->group_start, h->group, - sizeof(h->group)); - if (ecode != NTDB_SUCCESS) { - return ecode; + /* We don't remove chains: there should still be one there! */ + if (!val || !is_chain(val)) { + ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, + NTDB_LOG_ERROR, + "iterate_hash:" + " vanished hchain %llu at %llu", + (long long)val, + (long long)i); + val = NTDB_ERR_TO_OFF(ecode); + goto unlock; } - /* Move hashinfo down a level. */ - hashtable = (h->group[h->home_bucket] & NTDB_OFF_MASK) - + sizeof(struct ntdb_used_record); - gnum = use_bits(h,NTDB_SUBLEVEL_HASH_BITS - NTDB_HASH_GROUP_BITS); - h->home_bucket = use_bits(h, NTDB_HASH_GROUP_BITS); - h->group_start = hashtable - + gnum * (sizeof(ntdb_off_t) << NTDB_HASH_GROUP_BITS); - ecode = ntdb_read_convert(ntdb, h->group_start, &h->group, - sizeof(h->group)); - if (ecode != NTDB_SUCCESS) { - return ecode; + /* Find next bucket in the chain. */ + val = iterate_chain(ntdb, val, h); + if (NTDB_OFF_IS_ERR(val)) { + goto unlock; } - } + if (val != 0) { + return val; + } + ntdb_unlock_hash(ntdb, i, F_RDLCK); - /* Expanding the group must have made room if it didn't choose this - * bucket. */ - if (put_into_group(h->group, h->home_bucket, encode_offset(new_off,h))){ - return ntdb_write_convert(ntdb, h->group_start, - h->group, sizeof(h->group)); + /* OK, we've reset h back to top level. */ } - /* This can happen if all hashes in group (and us) dropped into same - * group in subhash. */ - return add_to_hash(ntdb, h, new_off); -} - -/* Traverse support: returns offset of record, or 0 or -ve error. */ -static ntdb_off_t iterate_hash(struct ntdb_context *ntdb, - struct traverse_info *tinfo) -{ - ntdb_off_t off, val, i; - struct traverse_level *tlevel; - - tlevel = &tinfo->levels[tinfo->num_levels-1]; - -again: - for (i = ntdb_find_nonzero_off(ntdb, tlevel->hashtable, - tlevel->entry, tlevel->total_buckets); - i != tlevel->total_buckets; - i = ntdb_find_nonzero_off(ntdb, tlevel->hashtable, - i+1, tlevel->total_buckets)) { - if (NTDB_OFF_IS_ERR(i)) { - return i; + /* We do this unlocked, then re-check. */ + for (i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0), + h->bucket, h->table_size); + i != h->table_size; + i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0), + i+1, h->table_size)) { + ecode = ntdb_lock_hash(ntdb, i, F_RDLCK); + if (ecode != NTDB_SUCCESS) { + return NTDB_ERR_TO_OFF(ecode); } - val = ntdb_read_off(ntdb, tlevel->hashtable+sizeof(ntdb_off_t)*i); + val = ntdb_read_off(ntdb, hbucket_off(h->table, i)); if (NTDB_OFF_IS_ERR(val)) { - return val; + goto unlock; } - off = val & NTDB_OFF_MASK; - - /* This makes the delete-all-in-traverse case work - * (and simplifies our logic a little). */ - if (off == tinfo->prev) + /* Lost race, and it's empty? */ + if (!val) { + ntdb->stats.traverse_val_vanished++; + ntdb_unlock_hash(ntdb, i, F_RDLCK); continue; + } - tlevel->entry = i; - - if (!is_subhash(val)) { - /* Found one. */ - tinfo->prev = off; - return off; + if (!is_chain(val)) { + /* So caller knows what lock to free. */ + h->h = i; + /* Return to next bucket. */ + h->bucket = i + 1; + val &= NTDB_OFF_MASK; + return val; } - /* When we come back, we want the next one */ - tlevel->entry++; - tinfo->num_levels++; - tlevel++; - tlevel->hashtable = off + sizeof(struct ntdb_used_record); - tlevel->entry = 0; - /* Next level is a chain? */ - if (unlikely(tinfo->num_levels == NTDB_MAX_LEVELS + 1)) - tlevel->total_buckets = (1 << NTDB_HASH_GROUP_BITS); - else - tlevel->total_buckets = (1 << NTDB_SUBLEVEL_HASH_BITS); - goto again; - } - - /* Nothing there? */ - if (tinfo->num_levels == 1) - return 0; + /* Start at beginning of chain */ + h->bucket = 0; + h->h = i; - /* Handle chained entries. */ - if (unlikely(tinfo->num_levels == NTDB_MAX_LEVELS + 1)) { - tlevel->hashtable = ntdb_read_off(ntdb, tlevel->hashtable - + offsetof(struct ntdb_chain, - next)); - if (NTDB_OFF_IS_ERR(tlevel->hashtable)) { - return tlevel->hashtable; + val = iterate_chain(ntdb, val, h); + if (NTDB_OFF_IS_ERR(val)) { + goto unlock; } - if (tlevel->hashtable) { - tlevel->hashtable += sizeof(struct ntdb_used_record); - tlevel->entry = 0; - goto again; + if (val != 0) { + return val; } + + /* Otherwise, bucket has been set to i+1 */ + ntdb_unlock_hash(ntdb, i, F_RDLCK); } + return 0; - /* Go back up and keep searching. */ - tinfo->num_levels--; - tlevel--; - goto again; +unlock: + ntdb_unlock_hash(ntdb, i, F_RDLCK); + return val; } /* Return success if we find something, NTDB_ERR_NOEXIST if none. */ enum NTDB_ERROR next_in_hash(struct ntdb_context *ntdb, - struct traverse_info *tinfo, - NTDB_DATA *kbuf, size_t *dlen) + struct hash_info *h, + NTDB_DATA *kbuf, size_t *dlen) { - const unsigned group_bits = NTDB_TOPLEVEL_HASH_BITS-NTDB_HASH_GROUP_BITS; - ntdb_off_t hl_start, hl_range, off; + ntdb_off_t off; + struct ntdb_used_record rec; enum NTDB_ERROR ecode; - while (tinfo->toplevel_group < (1 << group_bits)) { - hl_start = (ntdb_off_t)tinfo->toplevel_group - << (64 - group_bits); - hl_range = 1ULL << group_bits; - ecode = ntdb_lock_hashes(ntdb, hl_start, hl_range, F_RDLCK, - NTDB_LOCK_WAIT); - if (ecode != NTDB_SUCCESS) { - return ecode; - } - - off = iterate_hash(ntdb, tinfo); - if (off) { - struct ntdb_used_record rec; + off = lock_and_iterate_hash(ntdb, h); - if (NTDB_OFF_IS_ERR(off)) { - ecode = NTDB_OFF_TO_ERR(off); - goto fail; - } - - ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec)); - if (ecode != NTDB_SUCCESS) { - goto fail; - } - if (rec_magic(&rec) != NTDB_USED_MAGIC) { - ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, - NTDB_LOG_ERROR, - "next_in_hash:" - " corrupt record at %llu", - (long long)off); - goto fail; - } + if (NTDB_OFF_IS_ERR(off)) { + return NTDB_OFF_TO_ERR(off); + } else if (off == 0) { + return NTDB_ERR_NOEXIST; + } - kbuf->dsize = rec_key_length(&rec); - - /* They want data as well? */ - if (dlen) { - *dlen = rec_data_length(&rec); - kbuf->dptr = ntdb_alloc_read(ntdb, - off + sizeof(rec), - kbuf->dsize - + *dlen); - } else { - kbuf->dptr = ntdb_alloc_read(ntdb, - off + sizeof(rec), - kbuf->dsize); - } - ntdb_unlock_hashes(ntdb, hl_start, hl_range, F_RDLCK); - if (NTDB_PTR_IS_ERR(kbuf->dptr)) { - return NTDB_PTR_ERR(kbuf->dptr); - } - return NTDB_SUCCESS; - } + /* The hash for this key is still locked. */ + ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec)); + if (ecode != NTDB_SUCCESS) { + goto unlock; + } + if (rec_magic(&rec) != NTDB_USED_MAGIC) { + ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, + NTDB_LOG_ERROR, + "next_in_hash:" + " corrupt record at %llu", + (long long)off); + goto unlock; + } - ntdb_unlock_hashes(ntdb, hl_start, hl_range, F_RDLCK); + kbuf->dsize = rec_key_length(&rec); - tinfo->toplevel_group++; - tinfo->levels[0].hashtable - += (sizeof(ntdb_off_t) << NTDB_HASH_GROUP_BITS); - tinfo->levels[0].entry = 0; + /* They want data as well? */ + if (dlen) { + *dlen = rec_data_length(&rec); + kbuf->dptr = ntdb_alloc_read(ntdb, off + sizeof(rec), + kbuf->dsize + *dlen); + } else { + kbuf->dptr = ntdb_alloc_read(ntdb, off + sizeof(rec), + kbuf->dsize); } - return NTDB_ERR_NOEXIST; + if (NTDB_PTR_IS_ERR(kbuf->dptr)) { + ecode = NTDB_PTR_ERR(kbuf->dptr); + goto unlock; + } + ecode = NTDB_SUCCESS; -fail: - ntdb_unlock_hashes(ntdb, hl_start, hl_range, F_RDLCK); +unlock: + ntdb_unlock_hash(ntdb, bits_from(h->h, 0, ntdb->hash_bits), F_RDLCK); return ecode; } enum NTDB_ERROR first_in_hash(struct ntdb_context *ntdb, - struct traverse_info *tinfo, + struct hash_info *h, NTDB_DATA *kbuf, size_t *dlen) { - tinfo->prev = 0; - tinfo->toplevel_group = 0; - tinfo->num_levels = 1; - tinfo->levels[0].hashtable = offsetof(struct ntdb_header, hashtable); - tinfo->levels[0].entry = 0; - tinfo->levels[0].total_buckets = (1 << NTDB_HASH_GROUP_BITS); - - return next_in_hash(ntdb, tinfo, kbuf, dlen); + h->table = NTDB_HASH_OFFSET; + h->table_size = 1 << ntdb->hash_bits; + h->bucket = 0; + + return next_in_hash(ntdb, h, kbuf, dlen); } /* Even if the entry isn't in this hash bucket, you'd have to lock this * bucket to find it. */ -static enum NTDB_ERROR chainlock(struct ntdb_context *ntdb, const NTDB_DATA *key, - int ltype, enum ntdb_lock_flags waitflag, - const char *func) +static enum NTDB_ERROR chainlock(struct ntdb_context *ntdb, + const NTDB_DATA *key, int ltype) { - enum NTDB_ERROR ecode; - uint64_t h = ntdb_hash(ntdb, key->dptr, key->dsize); - ntdb_off_t lockstart, locksize; - unsigned int group, gbits; - - gbits = NTDB_TOPLEVEL_HASH_BITS - NTDB_HASH_GROUP_BITS; - group = bits_from(h, 64 - gbits, gbits); + uint32_t h = ntdb_hash(ntdb, key->dptr, key->dsize); - lockstart = hlock_range(group, &locksize); - - ecode = ntdb_lock_hashes(ntdb, lockstart, locksize, ltype, waitflag); - ntdb_trace_1rec(ntdb, func, *key); - return ecode; + return ntdb_lock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), ltype); } /* lock/unlock one hash chain. This is meant to be used to reduce contention - it cannot guarantee how many records will be locked */ _PUBLIC_ enum NTDB_ERROR ntdb_chainlock(struct ntdb_context *ntdb, NTDB_DATA key) { - return chainlock(ntdb, &key, F_WRLCK, NTDB_LOCK_WAIT, "ntdb_chainlock"); + return chainlock(ntdb, &key, F_WRLCK); } _PUBLIC_ void ntdb_chainunlock(struct ntdb_context *ntdb, NTDB_DATA key) { - uint64_t h = ntdb_hash(ntdb, key.dptr, key.dsize); - ntdb_off_t lockstart, locksize; - unsigned int group, gbits; - - gbits = NTDB_TOPLEVEL_HASH_BITS - NTDB_HASH_GROUP_BITS; - group = bits_from(h, 64 - gbits, gbits); + uint32_t h = ntdb_hash(ntdb, key.dptr, key.dsize); - lockstart = hlock_range(group, &locksize); - - ntdb_trace_1rec(ntdb, "ntdb_chainunlock", key); - ntdb_unlock_hashes(ntdb, lockstart, locksize, F_WRLCK); + ntdb_unlock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), F_WRLCK); } -_PUBLIC_ enum NTDB_ERROR ntdb_chainlock_read(struct ntdb_context *ntdb, NTDB_DATA key) +_PUBLIC_ enum NTDB_ERROR ntdb_chainlock_read(struct ntdb_context *ntdb, + NTDB_DATA key) { - return chainlock(ntdb, &key, F_RDLCK, NTDB_LOCK_WAIT, - "ntdb_chainlock_read"); + return chainlock(ntdb, &key, F_RDLCK); } _PUBLIC_ void ntdb_chainunlock_read(struct ntdb_context *ntdb, NTDB_DATA key) { - uint64_t h = ntdb_hash(ntdb, key.dptr, key.dsize); - ntdb_off_t lockstart, locksize; - unsigned int group, gbits; - - gbits = NTDB_TOPLEVEL_HASH_BITS - NTDB_HASH_GROUP_BITS; - group = bits_from(h, 64 - gbits, gbits); - - lockstart = hlock_range(group, &locksize); + uint32_t h = ntdb_hash(ntdb, key.dptr, key.dsize); - ntdb_trace_1rec(ntdb, "ntdb_chainunlock_read", key); - ntdb_unlock_hashes(ntdb, lockstart, locksize, F_RDLCK); + ntdb_unlock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), F_RDLCK); } |