/*
Trivial Database 2: hash handling
Copyright (C) Rusty Russell 2010
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, see .
*/
#include "private.h"
#include
/* Default hash function. */
uint32_t ntdb_jenkins_hash(const void *key, size_t length, uint32_t seed,
void *unused)
{
return hash_stable((const unsigned char *)key, length, seed);
}
uint32_t ntdb_hash(struct ntdb_context *ntdb, const void *ptr, size_t len)
{
return ntdb->hash_fn(ptr, len, ntdb->hash_seed, ntdb->hash_data);
}
static ntdb_bool_err key_matches(struct ntdb_context *ntdb,
const struct ntdb_used_record *rec,
ntdb_off_t off,
const NTDB_DATA *key,
const char **rptr)
{
ntdb_bool_err ret = false;
const char *rkey;
if (rec_key_length(rec) != key->dsize) {
ntdb->stats.compare_wrong_keylen++;
return ret;
}
rkey = ntdb_access_read(ntdb, off + sizeof(*rec),
key->dsize + rec_data_length(rec), false);
if (NTDB_PTR_IS_ERR(rkey)) {
return (ntdb_bool_err)NTDB_PTR_ERR(rkey);
}
if (memcmp(rkey, key->dptr, key->dsize) == 0) {
if (rptr) {
*rptr = rkey;
} else {
ntdb_access_release(ntdb, rkey);
}
return true;
}
ntdb->stats.compare_wrong_keycmp++;
ntdb_access_release(ntdb, rkey);
return ret;
}
/* Does entry match? */
static ntdb_bool_err match(struct ntdb_context *ntdb,
uint32_t hash,
const NTDB_DATA *key,
ntdb_off_t val,
struct ntdb_used_record *rec,
const char **rptr,
const ntdb_off_t **mapped)
{
ntdb_off_t off;
enum NTDB_ERROR ecode;
ntdb->stats.compares++;
/* Top bits of offset == next bits of hash. */
if (bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL)
!= bits_from(val, 64-NTDB_OFF_UPPER_STEAL, NTDB_OFF_UPPER_STEAL)) {
ntdb->stats.compare_wrong_offsetbits++;
return false;
}
/* Unmap before we try to read actual record, which may cause expand */
if (mapped) {
ntdb_access_release(ntdb, *mapped);
*mapped = NULL;
}
off = val & NTDB_OFF_MASK;
ecode = ntdb_read_convert(ntdb, off, rec, sizeof(*rec));
if (ecode != NTDB_SUCCESS) {
return (ntdb_bool_err)ecode;
}
return key_matches(ntdb, rec, off, key, rptr);
}
static bool is_chain(ntdb_off_t val)
{
return val & (1ULL << NTDB_OFF_CHAIN_BIT);
}
static ntdb_off_t hbucket_off(ntdb_off_t base, ntdb_len_t idx)
{
return base + sizeof(struct ntdb_used_record)
+ idx * sizeof(ntdb_off_t);
}
/* This is the core routine which searches the hashtable for an entry.
* On error, no locks are held and -ve is returned.
* Otherwise, hinfo is filled in.
* If not found, the return value is 0.
* If found, the return value is the offset, and *rec is the record. */
ntdb_off_t find_and_lock(struct ntdb_context *ntdb,
NTDB_DATA key,
int ltype,
struct hash_info *h,
struct ntdb_used_record *rec,
const char **rptr)
{
ntdb_off_t off, val;
const ntdb_off_t *arr = NULL;
ntdb_len_t i;
bool found_empty;
enum NTDB_ERROR ecode;
struct ntdb_used_record chdr;
ntdb_bool_err berr;
h->h = ntdb_hash(ntdb, key.dptr, key.dsize);
h->table = NTDB_HASH_OFFSET;
h->table_size = 1 << ntdb->hash_bits;
h->bucket = bits_from(h->h, 0, ntdb->hash_bits);
h->old_val = 0;
ecode = ntdb_lock_hash(ntdb, h->bucket, ltype);
if (ecode != NTDB_SUCCESS) {
return NTDB_ERR_TO_OFF(ecode);
}
off = hbucket_off(h->table, h->bucket);
val = ntdb_read_off(ntdb, off);
if (NTDB_OFF_IS_ERR(val)) {
ecode = NTDB_OFF_TO_ERR(val);
goto fail;
}
/* Directly in hash table? */
if (!likely(is_chain(val))) {
if (val) {
berr = match(ntdb, h->h, &key, val, rec, rptr, NULL);
if (berr < 0) {
ecode = NTDB_OFF_TO_ERR(berr);
goto fail;
}
if (berr) {
return val & NTDB_OFF_MASK;
}
/* If you want to insert here, make a chain. */
h->old_val = val;
}
return 0;
}
/* Nope? Iterate through chain. */
h->table = val & NTDB_OFF_MASK;
ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
if (ecode != NTDB_SUCCESS) {
goto fail;
}
if (rec_magic(&chdr) != NTDB_CHAIN_MAGIC) {
ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
NTDB_LOG_ERROR,
"find_and_lock:"
" corrupt record %#x at %llu",
rec_magic(&chdr), (long long)off);
goto fail;
}
h->table_size = rec_data_length(&chdr) / sizeof(ntdb_off_t);
found_empty = false;
for (i = 0; i < h->table_size; i++) {
/* Careful! match has to unmap this if we access a
* record (may cause mmap of database to move. */
if (!arr) {
arr = ntdb_access_read(ntdb, hbucket_off(h->table, 0),
rec_data_length(&chdr), true);
if (NTDB_PTR_IS_ERR(arr)) {
ecode = NTDB_PTR_ERR(arr);
goto fail;
}
}
val = arr[i];
if (val == 0) {
if (!found_empty) {
h->bucket = i;
found_empty = true;
}
} else {
berr = match(ntdb, h->h, &key, val, rec, rptr, &arr);
if (berr < 0) {
ecode = NTDB_OFF_TO_ERR(berr);
if (arr) {
ntdb_access_release(ntdb, arr);
}
goto fail;
}
if (berr) {
/* We found it! */
h->bucket = i;
off = val & NTDB_OFF_MASK;
if (arr) {
ntdb_access_release(ntdb, arr);
}
return off;
}
}
}
if (!found_empty) {
/* Set to any non-zero value */
h->old_val = 1;
h->bucket = i;
}
if (arr) {
ntdb_access_release(ntdb, arr);
}
return 0;
fail:
ntdb_unlock_hash(ntdb, h->bucket, ltype);
return NTDB_ERR_TO_OFF(ecode);
}
static ntdb_off_t encode_offset(const struct ntdb_context *ntdb,
ntdb_off_t new_off, uint32_t hash)
{
ntdb_off_t extra;
assert((new_off & (1ULL << NTDB_OFF_CHAIN_BIT)) == 0);
assert((new_off >> (64 - NTDB_OFF_UPPER_STEAL)) == 0);
/* We pack extra hash bits into the upper bits of the offset. */
extra = bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL);
extra <<= (64 - NTDB_OFF_UPPER_STEAL);
return new_off | extra;
}
/* Simply overwrite the hash entry we found before. */
enum NTDB_ERROR replace_in_hash(struct ntdb_context *ntdb,
const struct hash_info *h,
ntdb_off_t new_off)
{
return ntdb_write_off(ntdb, hbucket_off(h->table, h->bucket),
encode_offset(ntdb, new_off, h->h));
}
enum NTDB_ERROR delete_from_hash(struct ntdb_context *ntdb,
const struct hash_info *h)
{
return ntdb_write_off(ntdb, hbucket_off(h->table, h->bucket), 0);
}
enum NTDB_ERROR add_to_hash(struct ntdb_context *ntdb,
const struct hash_info *h,
ntdb_off_t new_off)
{
enum NTDB_ERROR ecode;
ntdb_off_t chain;
struct ntdb_used_record chdr;
const ntdb_off_t *old;
ntdb_off_t *new;
/* We hit an empty bucket during search? That's where it goes. */
if (!h->old_val) {
return replace_in_hash(ntdb, h, new_off);
}
/* Full at top-level? Create a 2-element chain. */
if (h->table == NTDB_HASH_OFFSET) {
ntdb_off_t pair[2];
/* One element is old value, the other is the new value. */
pair[0] = h->old_val;
pair[1] = encode_offset(ntdb, new_off, h->h);
chain = alloc(ntdb, 0, sizeof(pair), NTDB_CHAIN_MAGIC, true);
if (NTDB_OFF_IS_ERR(chain)) {
return NTDB_OFF_TO_ERR(chain);
}
ecode = ntdb_write_convert(ntdb,
chain
+ sizeof(struct ntdb_used_record),
pair, sizeof(pair));
if (ecode == NTDB_SUCCESS) {
ecode = ntdb_write_off(ntdb,
hbucket_off(h->table, h->bucket),
chain
| (1ULL << NTDB_OFF_CHAIN_BIT));
}
return ecode;
}
/* Full bucket. Expand. */
ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
if (ecode != NTDB_SUCCESS) {
return ecode;
}
if (rec_extra_padding(&chdr) >= sizeof(new_off)) {
/* Expand in place. */
uint64_t dlen = rec_data_length(&chdr);
ecode = set_header(ntdb, &chdr, NTDB_CHAIN_MAGIC, 0,
dlen + sizeof(new_off),
dlen + rec_extra_padding(&chdr));
if (ecode != NTDB_SUCCESS) {
return ecode;
}
/* find_and_lock set up h to point to last bucket. */
ecode = replace_in_hash(ntdb, h, new_off);
if (ecode != NTDB_SUCCESS) {
return ecode;
}
ecode = ntdb_write_convert(ntdb, h->table, &chdr, sizeof(chdr));
if (ecode != NTDB_SUCCESS) {
return ecode;
}
/* For futureproofing, we always make the first byte of padding
* a zero. */
if (rec_extra_padding(&chdr)) {
ecode = ntdb->io->twrite(ntdb, h->table + sizeof(chdr)
+ dlen + sizeof(new_off),
"", 1);
}
return ecode;
}
/* We need to reallocate the chain. */
chain = alloc(ntdb, 0, (h->table_size + 1) * sizeof(ntdb_off_t),
NTDB_CHAIN_MAGIC, true);
if (NTDB_OFF_IS_ERR(chain)) {
return NTDB_OFF_TO_ERR(chain);
}
/* Map both and copy across old buckets. */
old = ntdb_access_read(ntdb, hbucket_off(h->table, 0),
h->table_size*sizeof(ntdb_off_t), true);
if (NTDB_PTR_IS_ERR(old)) {
return NTDB_PTR_ERR(old);
}
new = ntdb_access_write(ntdb, hbucket_off(chain, 0),
(h->table_size + 1)*sizeof(ntdb_off_t), true);
if (NTDB_PTR_IS_ERR(new)) {
ntdb_access_release(ntdb, old);
return NTDB_PTR_ERR(new);
}
memcpy(new, old, h->bucket * sizeof(ntdb_off_t));
new[h->bucket] = encode_offset(ntdb, new_off, h->h);
ntdb_access_release(ntdb, old);
ecode = ntdb_access_commit(ntdb, new);
if (ecode != NTDB_SUCCESS) {
return ecode;
}
/* Free the old chain. */
ecode = add_free_record(ntdb, h->table,
sizeof(struct ntdb_used_record)
+ rec_data_length(&chdr)
+ rec_extra_padding(&chdr),
NTDB_LOCK_WAIT, true);
/* Replace top-level to point to new chain */
return ntdb_write_off(ntdb,
hbucket_off(NTDB_HASH_OFFSET,
bits_from(h->h, 0, ntdb->hash_bits)),
chain | (1ULL << NTDB_OFF_CHAIN_BIT));
}
/* Traverse support: returns offset of record, or 0 or -ve error. */
static ntdb_off_t iterate_chain(struct ntdb_context *ntdb,
ntdb_off_t val,
struct hash_info *h)
{
ntdb_off_t i;
enum NTDB_ERROR ecode;
struct ntdb_used_record chdr;
/* First load up chain header. */
h->table = val & NTDB_OFF_MASK;
ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
if (ecode != NTDB_SUCCESS) {
return ecode;
}
if (rec_magic(&chdr) != NTDB_CHAIN_MAGIC) {
return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
NTDB_LOG_ERROR,
"get_table:"
" corrupt record %#x at %llu",
rec_magic(&chdr),
(long long)h->table);
}
/* Chain length is implied by data length. */
h->table_size = rec_data_length(&chdr) / sizeof(ntdb_off_t);
i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0), h->bucket,
h->table_size);
if (NTDB_OFF_IS_ERR(i)) {
return i;
}
if (i != h->table_size) {
/* Return to next bucket. */
h->bucket = i + 1;
val = ntdb_read_off(ntdb, hbucket_off(h->table, i));
if (NTDB_OFF_IS_ERR(val)) {
return val;
}
return val & NTDB_OFF_MASK;
}
/* Go back up to hash table. */
h->table = NTDB_HASH_OFFSET;
h->table_size = 1 << ntdb->hash_bits;
h->bucket = bits_from(h->h, 0, ntdb->hash_bits) + 1;
return 0;
}
/* Keeps hash locked unless returns 0 or error. */
static ntdb_off_t lock_and_iterate_hash(struct ntdb_context *ntdb,
struct hash_info *h)
{
ntdb_off_t val, i;
enum NTDB_ERROR ecode;
if (h->table != NTDB_HASH_OFFSET) {
/* We're in a chain. */
i = bits_from(h->h, 0, ntdb->hash_bits);
ecode = ntdb_lock_hash(ntdb, i, F_RDLCK);
if (ecode != NTDB_SUCCESS) {
return NTDB_ERR_TO_OFF(ecode);
}
/* We dropped lock, bucket might have moved! */
val = ntdb_read_off(ntdb, hbucket_off(NTDB_HASH_OFFSET, i));
if (NTDB_OFF_IS_ERR(val)) {
goto unlock;
}
/* We don't remove chains: there should still be one there! */
if (!val || !is_chain(val)) {
ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
NTDB_LOG_ERROR,
"iterate_hash:"
" vanished hchain %llu at %llu",
(long long)val,
(long long)i);
val = NTDB_ERR_TO_OFF(ecode);
goto unlock;
}
/* Find next bucket in the chain. */
val = iterate_chain(ntdb, val, h);
if (NTDB_OFF_IS_ERR(val)) {
goto unlock;
}
if (val != 0) {
return val;
}
ntdb_unlock_hash(ntdb, i, F_RDLCK);
/* OK, we've reset h back to top level. */
}
/* We do this unlocked, then re-check. */
for (i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0),
h->bucket, h->table_size);
i != h->table_size;
i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0),
i+1, h->table_size)) {
ecode = ntdb_lock_hash(ntdb, i, F_RDLCK);
if (ecode != NTDB_SUCCESS) {
return NTDB_ERR_TO_OFF(ecode);
}
val = ntdb_read_off(ntdb, hbucket_off(h->table, i));
if (NTDB_OFF_IS_ERR(val)) {
goto unlock;
}
/* Lost race, and it's empty? */
if (!val) {
ntdb->stats.traverse_val_vanished++;
ntdb_unlock_hash(ntdb, i, F_RDLCK);
continue;
}
if (!is_chain(val)) {
/* So caller knows what lock to free. */
h->h = i;
/* Return to next bucket. */
h->bucket = i + 1;
val &= NTDB_OFF_MASK;
return val;
}
/* Start at beginning of chain */
h->bucket = 0;
h->h = i;
val = iterate_chain(ntdb, val, h);
if (NTDB_OFF_IS_ERR(val)) {
goto unlock;
}
if (val != 0) {
return val;
}
/* Otherwise, bucket has been set to i+1 */
ntdb_unlock_hash(ntdb, i, F_RDLCK);
}
return 0;
unlock:
ntdb_unlock_hash(ntdb, i, F_RDLCK);
return val;
}
/* Return success if we find something, NTDB_ERR_NOEXIST if none. */
enum NTDB_ERROR next_in_hash(struct ntdb_context *ntdb,
struct hash_info *h,
NTDB_DATA *kbuf, size_t *dlen)
{
ntdb_off_t off;
struct ntdb_used_record rec;
enum NTDB_ERROR ecode;
off = lock_and_iterate_hash(ntdb, h);
if (NTDB_OFF_IS_ERR(off)) {
return NTDB_OFF_TO_ERR(off);
} else if (off == 0) {
return NTDB_ERR_NOEXIST;
}
/* The hash for this key is still locked. */
ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
if (ecode != NTDB_SUCCESS) {
goto unlock;
}
if (rec_magic(&rec) != NTDB_USED_MAGIC) {
ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
NTDB_LOG_ERROR,
"next_in_hash:"
" corrupt record at %llu",
(long long)off);
goto unlock;
}
kbuf->dsize = rec_key_length(&rec);
/* They want data as well? */
if (dlen) {
*dlen = rec_data_length(&rec);
kbuf->dptr = ntdb_alloc_read(ntdb, off + sizeof(rec),
kbuf->dsize + *dlen);
} else {
kbuf->dptr = ntdb_alloc_read(ntdb, off + sizeof(rec),
kbuf->dsize);
}
if (NTDB_PTR_IS_ERR(kbuf->dptr)) {
ecode = NTDB_PTR_ERR(kbuf->dptr);
goto unlock;
}
ecode = NTDB_SUCCESS;
unlock:
ntdb_unlock_hash(ntdb, bits_from(h->h, 0, ntdb->hash_bits), F_RDLCK);
return ecode;
}
enum NTDB_ERROR first_in_hash(struct ntdb_context *ntdb,
struct hash_info *h,
NTDB_DATA *kbuf, size_t *dlen)
{
h->table = NTDB_HASH_OFFSET;
h->table_size = 1 << ntdb->hash_bits;
h->bucket = 0;
return next_in_hash(ntdb, h, kbuf, dlen);
}
/* Even if the entry isn't in this hash bucket, you'd have to lock this
* bucket to find it. */
static enum NTDB_ERROR chainlock(struct ntdb_context *ntdb,
const NTDB_DATA *key, int ltype)
{
uint32_t h = ntdb_hash(ntdb, key->dptr, key->dsize);
return ntdb_lock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), ltype);
}
/* lock/unlock one hash chain. This is meant to be used to reduce
contention - it cannot guarantee how many records will be locked */
_PUBLIC_ enum NTDB_ERROR ntdb_chainlock(struct ntdb_context *ntdb, NTDB_DATA key)
{
return chainlock(ntdb, &key, F_WRLCK);
}
_PUBLIC_ void ntdb_chainunlock(struct ntdb_context *ntdb, NTDB_DATA key)
{
uint32_t h = ntdb_hash(ntdb, key.dptr, key.dsize);
ntdb_unlock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), F_WRLCK);
}
_PUBLIC_ enum NTDB_ERROR ntdb_chainlock_read(struct ntdb_context *ntdb,
NTDB_DATA key)
{
return chainlock(ntdb, &key, F_RDLCK);
}
_PUBLIC_ void ntdb_chainunlock_read(struct ntdb_context *ntdb, NTDB_DATA key)
{
uint32_t h = ntdb_hash(ntdb, key.dptr, key.dsize);
ntdb_unlock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), F_RDLCK);
}