diff options
Diffstat (limited to 'source3/lib/tdb/common')
-rw-r--r-- | source3/lib/tdb/common/freelist.c | 186 | ||||
-rw-r--r-- | source3/lib/tdb/common/io.c | 20 | ||||
-rw-r--r-- | source3/lib/tdb/common/lock.c | 7 | ||||
-rw-r--r-- | source3/lib/tdb/common/open.c | 17 | ||||
-rw-r--r-- | source3/lib/tdb/common/tdb.c | 77 | ||||
-rw-r--r-- | source3/lib/tdb/common/tdb_private.h | 8 | ||||
-rw-r--r-- | source3/lib/tdb/common/transaction.c | 412 | ||||
-rw-r--r-- | source3/lib/tdb/common/traverse.c | 3 |
8 files changed, 469 insertions, 261 deletions
diff --git a/source3/lib/tdb/common/freelist.c b/source3/lib/tdb/common/freelist.c index b109643f23..2f2a4c379b 100644 --- a/source3/lib/tdb/common/freelist.c +++ b/source3/lib/tdb/common/freelist.c @@ -27,6 +27,12 @@ #include "tdb_private.h" +/* 'right' merges can involve O(n^2) cost when combined with a + traverse, so they are disabled until we find a way to do them in + O(1) time +*/ +#define USE_RIGHT_MERGES 0 + /* read a freelist record and check for simple errors */ int tdb_rec_free_read(struct tdb_context *tdb, tdb_off_t off, struct list_struct *rec) { @@ -56,7 +62,7 @@ int tdb_rec_free_read(struct tdb_context *tdb, tdb_off_t off, struct list_struct } - +#if USE_RIGHT_MERGES /* Remove an element from the freelist. Must have alloc lock. */ static int remove_from_freelist(struct tdb_context *tdb, tdb_off_t off, tdb_off_t next) { @@ -75,6 +81,7 @@ static int remove_from_freelist(struct tdb_context *tdb, tdb_off_t off, tdb_off_ TDB_LOG((tdb, TDB_DEBUG_FATAL,"remove_from_freelist: not on list at off=%d\n", off)); return TDB_ERRCODE(TDB_ERR_CORRUPT, -1); } +#endif /* update a record tailer (must hold allocation lock) */ @@ -93,8 +100,6 @@ static int update_tailer(struct tdb_context *tdb, tdb_off_t offset, neccessary. */ int tdb_free(struct tdb_context *tdb, tdb_off_t offset, struct list_struct *rec) { - tdb_off_t right, left; - /* Allocation and tailer lock */ if (tdb_lock(tdb, -1, F_WRLCK) != 0) return -1; @@ -105,9 +110,10 @@ int tdb_free(struct tdb_context *tdb, tdb_off_t offset, struct list_struct *rec) goto fail; } +#if USE_RIGHT_MERGES /* Look right first (I'm an Australian, dammit) */ - right = offset + sizeof(*rec) + rec->rec_len; - if (right + sizeof(*rec) <= tdb->map_size) { + if (offset + sizeof(*rec) + rec->rec_len + sizeof(*rec) <= tdb->map_size) { + tdb_off_t right = offset + sizeof(*rec) + rec->rec_len; struct list_struct r; if (tdb->methods->tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) { @@ -122,13 +128,18 @@ int tdb_free(struct tdb_context *tdb, tdb_off_t offset, struct list_struct *rec) goto left; } rec->rec_len += sizeof(r) + r.rec_len; + if (update_tailer(tdb, offset, rec) == -1) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_tailer failed at %u\n", offset)); + goto fail; + } } } - left: +#endif + /* Look left */ - left = offset - sizeof(tdb_off_t); - if (left > TDB_DATA_START(tdb->header.hash_size)) { + if (offset - sizeof(tdb_off_t) > TDB_DATA_START(tdb->header.hash_size)) { + tdb_off_t left = offset - sizeof(tdb_off_t); struct list_struct l; tdb_off_t leftsize; @@ -145,7 +156,12 @@ left: left = offset - leftsize; - /* Now read in record */ + if (leftsize > offset || + left < TDB_DATA_START(tdb->header.hash_size)) { + goto update; + } + + /* Now read in the left record */ if (tdb->methods->tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) { TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: left read failed at %u (%u)\n", left, leftsize)); goto update; @@ -153,21 +169,24 @@ left: /* If it's free, expand to include it. */ if (l.magic == TDB_FREE_MAGIC) { - if (remove_from_freelist(tdb, left, l.next) == -1) { - TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: left free failed at %u\n", left)); - goto update; - } else { - offset = left; - rec->rec_len += leftsize; + /* we now merge the new record into the left record, rather than the other + way around. This makes the operation O(1) instead of O(n). This change + prevents traverse from being O(n^2) after a lot of deletes */ + l.rec_len += sizeof(*rec) + rec->rec_len; + if (tdb_rec_write(tdb, left, &l) == -1) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_left failed at %u\n", left)); + goto fail; + } + if (update_tailer(tdb, left, &l) == -1) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_tailer failed at %u\n", offset)); + goto fail; } + tdb_unlock(tdb, -1, F_WRLCK); + return 0; } } update: - if (update_tailer(tdb, offset, rec) == -1) { - TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_tailer failed at %u\n", offset)); - goto fail; - } /* Now, prepend to free list */ rec->magic = TDB_FREE_MAGIC; @@ -189,62 +208,61 @@ update: } + /* the core of tdb_allocate - called when we have decided which free list entry to use + + Note that we try to allocate by grabbing data from the end of an existing record, + not the beginning. This is so the left merge in a free is more likely to be + able to free up the record without fragmentation */ -static tdb_off_t tdb_allocate_ofs(struct tdb_context *tdb, tdb_len_t length, tdb_off_t rec_ptr, - struct list_struct *rec, tdb_off_t last_ptr) +static tdb_off_t tdb_allocate_ofs(struct tdb_context *tdb, + tdb_len_t length, tdb_off_t rec_ptr, + struct list_struct *rec, tdb_off_t last_ptr) { - struct list_struct newrec; - tdb_off_t newrec_ptr; +#define MIN_REC_SIZE (sizeof(struct list_struct) + sizeof(tdb_off_t) + 8) - memset(&newrec, '\0', sizeof(newrec)); + if (rec->rec_len < length + MIN_REC_SIZE) { + /* we have to grab the whole record */ - /* found it - now possibly split it up */ - if (rec->rec_len > length + MIN_REC_SIZE) { - /* Length of left piece */ - length = TDB_ALIGN(length, TDB_ALIGNMENT); - - /* Right piece to go on free list */ - newrec.rec_len = rec->rec_len - (sizeof(*rec) + length); - newrec_ptr = rec_ptr + sizeof(*rec) + length; - - /* And left record is shortened */ - rec->rec_len = length; - } else { - newrec_ptr = 0; + /* unlink it from the previous record */ + if (tdb_ofs_write(tdb, last_ptr, &rec->next) == -1) { + return 0; + } + + /* mark it not free */ + rec->magic = TDB_MAGIC; + if (tdb_rec_write(tdb, rec_ptr, rec) == -1) { + return 0; + } + return rec_ptr; + } + + /* we're going to just shorten the existing record */ + rec->rec_len -= (length + sizeof(*rec)); + if (tdb_rec_write(tdb, rec_ptr, rec) == -1) { + return 0; } - - /* Remove allocated record from the free list */ - if (tdb_ofs_write(tdb, last_ptr, &rec->next) == -1) { + if (update_tailer(tdb, rec_ptr, rec) == -1) { return 0; } - - /* Update header: do this before we drop alloc - lock, otherwise tdb_free() might try to - merge with us, thinking we're free. - (Thanks Jeremy Allison). */ + + /* and setup the new record */ + rec_ptr += sizeof(*rec) + rec->rec_len; + + memset(rec, '\0', sizeof(*rec)); + rec->rec_len = length; rec->magic = TDB_MAGIC; + if (tdb_rec_write(tdb, rec_ptr, rec) == -1) { return 0; } - - /* Did we create new block? */ - if (newrec_ptr) { - /* Update allocated record tailer (we - shortened it). */ - if (update_tailer(tdb, rec_ptr, rec) == -1) { - return 0; - } - - /* Free new record */ - if (tdb_free(tdb, newrec_ptr, &newrec) == -1) { - return 0; - } + + if (update_tailer(tdb, rec_ptr, rec) == -1) { + return 0; } - - /* all done - return the new record offset */ + return rec_ptr; } @@ -261,12 +279,14 @@ tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct list_st tdb_off_t rec_ptr, last_ptr; tdb_len_t rec_len; } bestfit; + float multiplier = 1.0; if (tdb_lock(tdb, -1, F_WRLCK) == -1) return 0; /* Extra bytes required for tailer */ length += sizeof(tdb_off_t); + length = TDB_ALIGN(length, TDB_ALIGNMENT); again: last_ptr = FREELIST_TOP; @@ -295,18 +315,27 @@ tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct list_st bestfit.rec_len = rec->rec_len; bestfit.rec_ptr = rec_ptr; bestfit.last_ptr = last_ptr; - /* consider a fit to be good enough if - we aren't wasting more than half - the space */ - if (bestfit.rec_len < 2*length) { - break; - } } } /* move to the next record */ last_ptr = rec_ptr; rec_ptr = rec->next; + + /* if we've found a record that is big enough, then + stop searching if its also not too big. The + definition of 'too big' changes as we scan + through */ + if (bestfit.rec_len > 0 && + bestfit.rec_len < length * multiplier) { + break; + } + + /* this multiplier means we only extremely rarely + search more than 50 or so records. At 50 records we + accept records up to 11 times larger than what we + want */ + multiplier *= 1.05; } if (bestfit.rec_ptr != 0) { @@ -314,7 +343,8 @@ tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct list_st goto fail; } - newrec_ptr = tdb_allocate_ofs(tdb, length, bestfit.rec_ptr, rec, bestfit.last_ptr); + newrec_ptr = tdb_allocate_ofs(tdb, length, bestfit.rec_ptr, + rec, bestfit.last_ptr); tdb_unlock(tdb, -1, F_WRLCK); return newrec_ptr; } @@ -328,3 +358,25 @@ tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct list_st return 0; } + + +/* + return the size of the freelist - used to decide if we should repack +*/ +int tdb_freelist_size(struct tdb_context *tdb) +{ + tdb_off_t ptr; + int count=0; + + if (tdb_lock(tdb, -1, F_RDLCK) == -1) { + return -1; + } + + ptr = FREELIST_TOP; + while (tdb_ofs_read(tdb, ptr, &ptr) == 0 && ptr != 0) { + count++; + } + + tdb_unlock(tdb, -1, F_RDLCK); + return count; +} diff --git a/source3/lib/tdb/common/io.c b/source3/lib/tdb/common/io.c index 8ab0768883..172ab69d8c 100644 --- a/source3/lib/tdb/common/io.c +++ b/source3/lib/tdb/common/io.c @@ -101,8 +101,8 @@ static int tdb_write(struct tdb_context *tdb, tdb_off_t off, off+written); } if (written == -1) { - /* Ensure ecode is set for log fn. */ - tdb->ecode = TDB_ERR_IO; + /* Ensure ecode is set for log fn. */ + tdb->ecode = TDB_ERR_IO; TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_write failed at %d " "len=%d (%s)\n", off, len, strerror(errno))); return TDB_ERRCODE(TDB_ERR_IO, -1); @@ -111,8 +111,8 @@ static int tdb_write(struct tdb_context *tdb, tdb_off_t off, "write %d bytes at %d in two attempts\n", len, off)); errno = ENOSPC; - return TDB_ERRCODE(TDB_ERR_IO, -1); - } + return TDB_ERRCODE(TDB_ERR_IO, -1); + } } return 0; } @@ -230,7 +230,7 @@ void tdb_mmap(struct tdb_context *tdb) says to use for mmap expansion */ static int tdb_expand_file(struct tdb_context *tdb, tdb_off_t size, tdb_off_t addition) { - char buf[1024]; + char buf[8192]; if (tdb->read_only || tdb->traverse_read) { tdb->ecode = TDB_ERR_RDONLY; @@ -294,7 +294,7 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_off_t size, tdb_off_t ad int tdb_expand(struct tdb_context *tdb, tdb_off_t size) { struct list_struct rec; - tdb_off_t offset; + tdb_off_t offset, new_size; if (tdb_lock(tdb, -1, F_WRLCK) == -1) { TDB_LOG((tdb, TDB_DEBUG_ERROR, "lock failed in tdb_expand\n")); @@ -304,9 +304,11 @@ int tdb_expand(struct tdb_context *tdb, tdb_off_t size) /* must know about any previous expansions by another process */ tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1); - /* always make room for at least 10 more records, and round - the database up to a multiple of the page size */ - size = TDB_ALIGN(tdb->map_size + size*10, tdb->page_size) - tdb->map_size; + /* always make room for at least 100 more records, and at + least 25% more space. Round the database up to a multiple + of the page size */ + new_size = MAX(tdb->map_size + size*100, tdb->map_size * 1.25); + size = TDB_ALIGN(new_size, tdb->page_size) - tdb->map_size; if (!(tdb->flags & TDB_INTERNAL)) tdb_munmap(tdb); diff --git a/source3/lib/tdb/common/lock.c b/source3/lib/tdb/common/lock.c index e3fe888c46..f156c0fa7b 100644 --- a/source3/lib/tdb/common/lock.c +++ b/source3/lib/tdb/common/lock.c @@ -505,6 +505,9 @@ int tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key) /* record lock stops delete underneath */ int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off) { + if (tdb->global_lock.count) { + return 0; + } return off ? tdb->methods->tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0, 1) : 0; } @@ -537,6 +540,10 @@ int tdb_unlock_record(struct tdb_context *tdb, tdb_off_t off) struct tdb_traverse_lock *i; uint32_t count = 0; + if (tdb->global_lock.count) { + return 0; + } + if (off == 0) return 0; for (i = &tdb->travlocks; i; i = i->next) diff --git a/source3/lib/tdb/common/open.c b/source3/lib/tdb/common/open.c index 0bd1c91a5e..b19e4cea29 100644 --- a/source3/lib/tdb/common/open.c +++ b/source3/lib/tdb/common/open.c @@ -35,7 +35,7 @@ static struct tdb_context *tdbs = NULL; static unsigned int default_tdb_hash(TDB_DATA *key) { uint32_t value; /* Used to compute the hash value. */ - uint32_t i; /* Used to cycle through random values. */ + uint32_t i; /* Used to cycle through random values. */ /* Set the initial value from the key size. */ for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++) @@ -90,7 +90,7 @@ static int tdb_new_database(struct tdb_context *tdb, int hash_size) size -= written; written = write(tdb->fd, newdb+written, size); if (written == size) { - ret = 0; + ret = 0; } else if (written >= 0) { /* a second incomplete write - we give up. * guessing the errno... */ @@ -152,6 +152,7 @@ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags, int rev = 0, locked = 0; unsigned char *vp; uint32_t vertest; + unsigned v; if (!(tdb = (struct tdb_context *)calloc(1, sizeof *tdb))) { /* Can't log this */ @@ -178,9 +179,7 @@ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags, tdb->page_size = 0x2000; } - if (open_flags & TDB_VOLATILE) { - tdb->max_dead_records = 5; - } + tdb->max_dead_records = (tdb_flags & TDB_VOLATILE) ? 5 : 0; if ((open_flags & O_ACCMODE) == O_WRONLY) { TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: can't open tdb %s write-only\n", @@ -215,6 +214,10 @@ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags, goto fail; /* errno set by open(2) */ } + /* on exec, don't inherit the fd */ + v = fcntl(tdb->fd, F_GETFD, 0); + fcntl(tdb->fd, F_SETFD, v | FD_CLOEXEC); + /* ensure there is only one process initialising at once */ if (tdb->methods->tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0, 1) == -1) { TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: failed to get global lock on %s: %s\n", @@ -224,6 +227,7 @@ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags, /* we need to zero database if we are the only one with it open */ if ((tdb_flags & TDB_CLEAR_IF_FIRST) && + (!tdb->read_only) && (locked = (tdb->methods->tdb_brlock(tdb, ACTIVE_LOCK, F_WRLCK, F_SETLK, 0, 1) == 0))) { open_flags |= O_CREAT; if (ftruncate(tdb->fd, 0) == -1) { @@ -242,7 +246,7 @@ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags, /* its not a valid database - possibly initialise it */ if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) { if (errno == 0) { - errno = EIO; /* ie bad format or something */ + errno = EIO; /* ie bad format or something */ } goto fail; } @@ -283,7 +287,6 @@ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags, tdb->map_size = st.st_size; tdb->device = st.st_dev; tdb->inode = st.st_ino; - tdb->max_dead_records = 0; tdb_mmap(tdb); if (locked) { if (tdb->methods->tdb_brlock(tdb, ACTIVE_LOCK, F_UNLCK, F_SETLK, 0, 1) == -1) { diff --git a/source3/lib/tdb/common/tdb.c b/source3/lib/tdb/common/tdb.c index 0e9d1dbd74..ea5d9ccc60 100644 --- a/source3/lib/tdb/common/tdb.c +++ b/source3/lib/tdb/common/tdb.c @@ -102,8 +102,7 @@ static tdb_off_t tdb_find(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, } /* As tdb_find, but if you succeed, keep the lock */ -tdb_off_t tdb_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, - uint32_t hash, int locktype, +tdb_off_t tdb_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, int locktype, struct list_struct *rec) { uint32_t rec_ptr; @@ -237,14 +236,15 @@ int tdb_exists(struct tdb_context *tdb, TDB_DATA key) } /* actually delete an entry in the database given the offset */ -int tdb_do_delete(struct tdb_context *tdb, tdb_off_t rec_ptr, struct list_struct*rec) +int tdb_do_delete(struct tdb_context *tdb, tdb_off_t rec_ptr, struct list_struct *rec) { tdb_off_t last_ptr, i; struct list_struct lastrec; if (tdb->read_only || tdb->traverse_read) return -1; - if (tdb_write_lock_record(tdb, rec_ptr) == -1) { + if (tdb->traverse_write != 0 || + tdb_write_lock_record(tdb, rec_ptr) == -1) { /* Someone traversing here: mark it as dead */ rec->magic = TDB_DEAD_MAGIC; return tdb_rec_write(tdb, rec_ptr, rec); @@ -666,6 +666,16 @@ int tdb_get_flags(struct tdb_context *tdb) return tdb->flags; } +void tdb_add_flags(struct tdb_context *tdb, unsigned flags) +{ + tdb->flags |= flags; +} + +void tdb_remove_flags(struct tdb_context *tdb, unsigned flags) +{ + tdb->flags &= ~flags; +} + /* enable sequence number handling on an open tdb @@ -674,3 +684,62 @@ void tdb_enable_seqnum(struct tdb_context *tdb) { tdb->flags |= TDB_SEQNUM; } + + +/* + wipe the entire database, deleting all records. This can be done + very fast by using a global lock. The entire data portion of the + file becomes a single entry in the freelist. + */ +int tdb_wipe_all(struct tdb_context *tdb) +{ + int i; + tdb_off_t offset = 0; + ssize_t data_len; + + if (tdb_lockall(tdb) != 0) { + return -1; + } + + /* wipe the hashes */ + for (i=0;i<tdb->header.hash_size;i++) { + if (tdb_ofs_write(tdb, TDB_HASH_TOP(i), &offset) == -1) { + TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to write hash %d\n", i)); + goto failed; + } + } + + /* wipe the freelist */ + if (tdb_ofs_write(tdb, FREELIST_TOP, &offset) == -1) { + TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to write freelist\n")); + goto failed; + } + + if (tdb_ofs_write(tdb, TDB_RECOVERY_HEAD, &offset) == -1) { + TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to write recovery head\n")); + goto failed; + } + + /* add all the rest of the file to the freelist */ + data_len = (tdb->map_size - TDB_DATA_START(tdb->header.hash_size)) - sizeof(struct list_struct); + if (data_len > 0) { + struct list_struct rec; + memset(&rec,'\0',sizeof(rec)); + rec.rec_len = data_len; + if (tdb_free(tdb, TDB_DATA_START(tdb->header.hash_size), &rec) == -1) { + TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to add free record\n")); + goto failed; + } + } + + if (tdb_unlockall(tdb) != 0) { + TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to unlock\n")); + goto failed; + } + + return 0; + +failed: + tdb_unlockall(tdb); + return -1; +} diff --git a/source3/lib/tdb/common/tdb_private.h b/source3/lib/tdb/common/tdb_private.h index 58c30c1706..ffac89ff0e 100644 --- a/source3/lib/tdb/common/tdb_private.h +++ b/source3/lib/tdb/common/tdb_private.h @@ -38,6 +38,10 @@ typedef uint32_t tdb_len_t; typedef uint32_t tdb_off_t; +#ifndef offsetof +#define offsetof(t,f) ((unsigned int)&((t *)0)->f) +#endif + #define TDB_MAGIC_FOOD "TDB file\n" #define TDB_VERSION (0x26011967 + 6) #define TDB_MAGIC (0x26011999U) @@ -45,7 +49,6 @@ typedef uint32_t tdb_off_t; #define TDB_DEAD_MAGIC (0xFEE1DEAD) #define TDB_RECOVERY_MAGIC (0xf53bc0e7U) #define TDB_ALIGNMENT 4 -#define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGNMENT) #define DEFAULT_HASH_SIZE 131 #define FREELIST_TOP (sizeof(struct tdb_header)) #define TDB_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1)) @@ -54,7 +57,7 @@ typedef uint32_t tdb_off_t; #define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r)) #define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off_t)) #define TDB_HASHTABLE_SIZE(tdb) ((tdb->header.hash_size+1)*sizeof(tdb_off_t)) -#define TDB_DATA_START(hash_size) TDB_HASH_TOP(hash_size-1) +#define TDB_DATA_START(hash_size) (TDB_HASH_TOP(hash_size-1) + sizeof(tdb_off_t)) #define TDB_RECOVERY_HEAD offsetof(struct tdb_header, recovery_start) #define TDB_SEQNUM_OFS offsetof(struct tdb_header, sequence_number) #define TDB_PAD_BYTE 0x42 @@ -144,6 +147,7 @@ struct tdb_context { tdb_len_t map_size; /* how much space has been mapped */ int read_only; /* opened read-only */ int traverse_read; /* read-only traversal */ + int traverse_write; /* read-write traversal */ struct tdb_lock_type global_lock; int num_lockrecs; struct tdb_lock_type *lockrecs; /* only real locks, all with count>0 */ diff --git a/source3/lib/tdb/common/transaction.c b/source3/lib/tdb/common/transaction.c index 7eaacf7a16..4e2127be64 100644 --- a/source3/lib/tdb/common/transaction.c +++ b/source3/lib/tdb/common/transaction.c @@ -87,12 +87,6 @@ */ -struct tdb_transaction_el { - struct tdb_transaction_el *next, *prev; - tdb_off_t offset; - tdb_len_t length; - unsigned char *data; -}; /* hold the context of any current transaction @@ -105,12 +99,12 @@ struct tdb_transaction { /* the original io methods - used to do IOs to the real db */ const struct tdb_methods *io_methods; - /* the list of transaction elements. We use a doubly linked - list with a last pointer to allow us to keep the list - ordered, with first element at the front of the list. It - needs to be doubly linked as the read/write traversals need - to be backwards, while the commit needs to be forwards */ - struct tdb_transaction_el *elements, *elements_last; + /* the list of transaction blocks. When a block is first + written to, it gets created in this list */ + uint8_t **blocks; + uint32_t num_blocks; + uint32_t block_size; /* bytes in each block */ + uint32_t last_block_size; /* number of valid bytes in the last block */ /* non-zero when an internal transaction error has occurred. All write operations will then fail until the @@ -134,52 +128,48 @@ struct tdb_transaction { static int transaction_read(struct tdb_context *tdb, tdb_off_t off, void *buf, tdb_len_t len, int cv) { - struct tdb_transaction_el *el; - - /* we need to walk the list backwards to get the most recent data */ - for (el=tdb->transaction->elements_last;el;el=el->prev) { - tdb_len_t partial; + uint32_t blk; - if (off+len <= el->offset) { - continue; - } - if (off >= el->offset + el->length) { - continue; + /* break it down into block sized ops */ + while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) { + tdb_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size); + if (transaction_read(tdb, off, buf, len2, cv) != 0) { + return -1; } + len -= len2; + off += len2; + buf = (void *)(len2 + (char *)buf); + } - /* an overlapping read - needs to be split into up to - 2 reads and a memcpy */ - if (off < el->offset) { - partial = el->offset - off; - if (transaction_read(tdb, off, buf, partial, cv) != 0) { - goto fail; - } - len -= partial; - off += partial; - buf = (void *)(partial + (char *)buf); - } - if (off + len <= el->offset + el->length) { - partial = len; - } else { - partial = el->offset + el->length - off; - } - memcpy(buf, el->data + (off - el->offset), partial); - if (cv) { - tdb_convert(buf, len); - } - len -= partial; - off += partial; - buf = (void *)(partial + (char *)buf); - - if (len != 0 && transaction_read(tdb, off, buf, len, cv) != 0) { + if (len == 0) { + return 0; + } + + blk = off / tdb->transaction->block_size; + + /* see if we have it in the block list */ + if (tdb->transaction->num_blocks <= blk || + tdb->transaction->blocks[blk] == NULL) { + /* nope, do a real read */ + if (tdb->transaction->io_methods->tdb_read(tdb, off, buf, len, cv) != 0) { goto fail; } - return 0; } - /* its not in the transaction elements - do a real read */ - return tdb->transaction->io_methods->tdb_read(tdb, off, buf, len, cv); + /* it is in the block list. Now check for the last block */ + if (blk == tdb->transaction->num_blocks-1) { + if (len > tdb->transaction->last_block_size) { + goto fail; + } + } + + /* now copy it out of this block */ + memcpy(buf, tdb->transaction->blocks[blk] + (off % tdb->transaction->block_size), len); + if (cv) { + tdb_convert(buf, len); + } + return 0; fail: TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_read: failed at off=%d len=%d\n", off, len)); @@ -195,12 +185,8 @@ fail: static int transaction_write(struct tdb_context *tdb, tdb_off_t off, const void *buf, tdb_len_t len) { - struct tdb_transaction_el *el, *best_el=NULL; + uint32_t blk; - if (len == 0) { - return 0; - } - /* if the write is to a hash head, then update the transaction hash heads */ if (len == sizeof(tdb_off_t) && off >= FREELIST_TOP && @@ -209,110 +195,145 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off, memcpy(&tdb->transaction->hash_heads[chain], buf, len); } - /* first see if we can replace an existing entry */ - for (el=tdb->transaction->elements_last;el;el=el->prev) { - tdb_len_t partial; - - if (best_el == NULL && off == el->offset+el->length) { - best_el = el; - } - - if (off+len <= el->offset) { - continue; + /* break it up into block sized chunks */ + while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) { + tdb_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size); + if (transaction_write(tdb, off, buf, len2) != 0) { + return -1; } - if (off >= el->offset + el->length) { - continue; + len -= len2; + off += len2; + if (buf != NULL) { + buf = (const void *)(len2 + (const char *)buf); } + } - /* an overlapping write - needs to be split into up to - 2 writes and a memcpy */ - if (off < el->offset) { - partial = el->offset - off; - if (transaction_write(tdb, off, buf, partial) != 0) { - goto fail; - } - len -= partial; - off += partial; - buf = (const void *)(partial + (const char *)buf); - } - if (off + len <= el->offset + el->length) { - partial = len; + if (len == 0) { + return 0; + } + + blk = off / tdb->transaction->block_size; + off = off % tdb->transaction->block_size; + + if (tdb->transaction->num_blocks <= blk) { + uint8_t **new_blocks; + /* expand the blocks array */ + if (tdb->transaction->blocks == NULL) { + new_blocks = (uint8_t **)malloc( + (blk+1)*sizeof(uint8_t *)); } else { - partial = el->offset + el->length - off; + new_blocks = (uint8_t **)realloc( + tdb->transaction->blocks, + (blk+1)*sizeof(uint8_t *)); } - memcpy(el->data + (off - el->offset), buf, partial); - len -= partial; - off += partial; - buf = (const void *)(partial + (const char *)buf); - - if (len != 0 && transaction_write(tdb, off, buf, len) != 0) { + if (new_blocks == NULL) { + tdb->ecode = TDB_ERR_OOM; goto fail; } - - return 0; + memset(&new_blocks[tdb->transaction->num_blocks], 0, + (1+(blk - tdb->transaction->num_blocks))*sizeof(uint8_t *)); + tdb->transaction->blocks = new_blocks; + tdb->transaction->num_blocks = blk+1; + tdb->transaction->last_block_size = 0; } - /* see if we can append the new entry to an existing entry */ - if (best_el && best_el->offset + best_el->length == off && - (off+len < tdb->transaction->old_map_size || - off > tdb->transaction->old_map_size)) { - unsigned char *data = best_el->data; - el = best_el; - el->data = (unsigned char *)realloc(el->data, - el->length + len); - if (el->data == NULL) { + /* allocate and fill a block? */ + if (tdb->transaction->blocks[blk] == NULL) { + tdb->transaction->blocks[blk] = (uint8_t *)calloc(tdb->transaction->block_size, 1); + if (tdb->transaction->blocks[blk] == NULL) { tdb->ecode = TDB_ERR_OOM; tdb->transaction->transaction_error = 1; - el->data = data; - return -1; + return -1; } - if (buf) { - memcpy(el->data + el->length, buf, len); - } else { - memset(el->data + el->length, TDB_PAD_BYTE, len); + if (tdb->transaction->old_map_size > blk * tdb->transaction->block_size) { + tdb_len_t len2 = tdb->transaction->block_size; + if (len2 + (blk * tdb->transaction->block_size) > tdb->transaction->old_map_size) { + len2 = tdb->transaction->old_map_size - (blk * tdb->transaction->block_size); + } + if (tdb->transaction->io_methods->tdb_read(tdb, blk * tdb->transaction->block_size, + tdb->transaction->blocks[blk], + len2, 0) != 0) { + SAFE_FREE(tdb->transaction->blocks[blk]); + tdb->ecode = TDB_ERR_IO; + goto fail; + } + if (blk == tdb->transaction->num_blocks-1) { + tdb->transaction->last_block_size = len2; + } } - el->length += len; - return 0; - } - - /* add a new entry at the end of the list */ - el = (struct tdb_transaction_el *)malloc(sizeof(*el)); - if (el == NULL) { - tdb->ecode = TDB_ERR_OOM; - tdb->transaction->transaction_error = 1; - return -1; - } - el->next = NULL; - el->prev = tdb->transaction->elements_last; - el->offset = off; - el->length = len; - el->data = (unsigned char *)malloc(len); - if (el->data == NULL) { - free(el); - tdb->ecode = TDB_ERR_OOM; - tdb->transaction->transaction_error = 1; - return -1; } - if (buf) { - memcpy(el->data, buf, len); + + /* overwrite part of an existing block */ + if (buf == NULL) { + memset(tdb->transaction->blocks[blk] + off, 0, len); } else { - memset(el->data, TDB_PAD_BYTE, len); + memcpy(tdb->transaction->blocks[blk] + off, buf, len); } - if (el->prev) { - el->prev->next = el; - } else { - tdb->transaction->elements = el; + if (blk == tdb->transaction->num_blocks-1) { + if (len + off > tdb->transaction->last_block_size) { + tdb->transaction->last_block_size = len + off; + } } - tdb->transaction->elements_last = el; + return 0; fail: - TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_write: failed at off=%d len=%d\n", off, len)); - tdb->ecode = TDB_ERR_IO; + TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_write: failed at off=%d len=%d\n", + (blk*tdb->transaction->block_size) + off, len)); tdb->transaction->transaction_error = 1; return -1; } + +/* + write while in a transaction - this varient never expands the transaction blocks, it only + updates existing blocks. This means it cannot change the recovery size +*/ +static int transaction_write_existing(struct tdb_context *tdb, tdb_off_t off, + const void *buf, tdb_len_t len) +{ + uint32_t blk; + + /* break it up into block sized chunks */ + while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) { + tdb_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size); + if (transaction_write_existing(tdb, off, buf, len2) != 0) { + return -1; + } + len -= len2; + off += len2; + if (buf != NULL) { + buf = (const void *)(len2 + (const char *)buf); + } + } + + if (len == 0) { + return 0; + } + + blk = off / tdb->transaction->block_size; + off = off % tdb->transaction->block_size; + + if (tdb->transaction->num_blocks <= blk || + tdb->transaction->blocks[blk] == NULL) { + return 0; + } + + if (blk == tdb->transaction->num_blocks-1 && + off + len > tdb->transaction->last_block_size) { + if (off >= tdb->transaction->last_block_size) { + return 0; + } + len = tdb->transaction->last_block_size - off; + } + + /* overwrite part of an existing block */ + memcpy(tdb->transaction->blocks[blk] + off, buf, len); + + return 0; +} + + /* accelerated hash chain head search, using the cached hash heads */ @@ -419,10 +440,14 @@ int tdb_transaction_start(struct tdb_context *tdb) return -1; } + /* a page at a time seems like a reasonable compromise between compactness and efficiency */ + tdb->transaction->block_size = tdb->page_size; + /* get the transaction write lock. This is a blocking lock. As discussed with Volker, there are a number of ways we could make this async, which we will probably do in the future */ if (tdb_transaction_lock(tdb, F_WRLCK) == -1) { + SAFE_FREE(tdb->transaction->blocks); SAFE_FREE(tdb->transaction); return -1; } @@ -460,21 +485,12 @@ int tdb_transaction_start(struct tdb_context *tdb) tdb->transaction->io_methods = tdb->methods; tdb->methods = &transaction_methods; - /* by calling this transaction write here, we ensure that we don't grow the - transaction linked list due to hash table updates */ - if (transaction_write(tdb, FREELIST_TOP, tdb->transaction->hash_heads, - TDB_HASHTABLE_SIZE(tdb)) != 0) { - TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_start: failed to prime hash table\n")); - tdb->ecode = TDB_ERR_IO; - tdb->methods = tdb->transaction->io_methods; - goto fail; - } - return 0; fail: tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW, 0, 0); tdb_transaction_unlock(tdb); + SAFE_FREE(tdb->transaction->blocks); SAFE_FREE(tdb->transaction->hash_heads); SAFE_FREE(tdb->transaction); return -1; @@ -486,6 +502,8 @@ fail: */ int tdb_transaction_cancel(struct tdb_context *tdb) { + int i; + if (tdb->transaction == NULL) { TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_cancel: no transaction\n")); return -1; @@ -499,13 +517,13 @@ int tdb_transaction_cancel(struct tdb_context *tdb) tdb->map_size = tdb->transaction->old_map_size; - /* free all the transaction elements */ - while (tdb->transaction->elements) { - struct tdb_transaction_el *el = tdb->transaction->elements; - tdb->transaction->elements = el->next; - free(el->data); - free(el); + /* free all the transaction blocks */ + for (i=0;i<tdb->transaction->num_blocks;i++) { + if (tdb->transaction->blocks[i] != NULL) { + free(tdb->transaction->blocks[i]); + } } + SAFE_FREE(tdb->transaction->blocks); /* remove any global lock created during the transaction */ if (tdb->global_lock.count != 0) { @@ -515,7 +533,6 @@ int tdb_transaction_cancel(struct tdb_context *tdb) /* remove any locks created during the transaction */ if (tdb->num_locks != 0) { - int i; for (i=0;i<tdb->num_lockrecs;i++) { tdb_brlock(tdb,FREELIST_TOP+4*tdb->lockrecs[i].list, F_UNLCK,F_SETLKW, 0, 1); @@ -567,16 +584,24 @@ static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t */ static tdb_len_t tdb_recovery_size(struct tdb_context *tdb) { - struct tdb_transaction_el *el; tdb_len_t recovery_size = 0; + int i; recovery_size = sizeof(uint32_t); - for (el=tdb->transaction->elements;el;el=el->next) { - if (el->offset >= tdb->transaction->old_map_size) { + for (i=0;i<tdb->transaction->num_blocks;i++) { + if (i * tdb->transaction->block_size >= tdb->transaction->old_map_size) { + break; + } + if (tdb->transaction->blocks[i] == NULL) { continue; } - recovery_size += 2*sizeof(tdb_off_t) + el->length; - } + recovery_size += 2*sizeof(tdb_off_t); + if (i == tdb->transaction->num_blocks-1) { + recovery_size += tdb->transaction->last_block_size; + } else { + recovery_size += tdb->transaction->block_size; + } + } return recovery_size; } @@ -658,6 +683,10 @@ static int tdb_recovery_allocate(struct tdb_context *tdb, TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to write recovery head\n")); return -1; } + if (transaction_write_existing(tdb, TDB_RECOVERY_HEAD, &recovery_head, sizeof(tdb_off_t)) == -1) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to write recovery head\n")); + return -1; + } return 0; } @@ -669,7 +698,6 @@ static int tdb_recovery_allocate(struct tdb_context *tdb, static int transaction_setup_recovery(struct tdb_context *tdb, tdb_off_t *magic_offset) { - struct tdb_transaction_el *el; tdb_len_t recovery_size; unsigned char *data, *p; const struct tdb_methods *methods = tdb->transaction->io_methods; @@ -677,6 +705,7 @@ static int transaction_setup_recovery(struct tdb_context *tdb, tdb_off_t recovery_offset, recovery_max_size; tdb_off_t old_map_size = tdb->transaction->old_map_size; uint32_t magic, tailer; + int i; /* check that the recovery area has enough space @@ -704,30 +733,43 @@ static int transaction_setup_recovery(struct tdb_context *tdb, /* build the recovery data into a single blob to allow us to do a single large write, which should be more efficient */ p = data + sizeof(*rec); - for (el=tdb->transaction->elements;el;el=el->next) { - if (el->offset >= old_map_size) { + for (i=0;i<tdb->transaction->num_blocks;i++) { + tdb_off_t offset; + tdb_len_t length; + + if (tdb->transaction->blocks[i] == NULL) { + continue; + } + + offset = i * tdb->transaction->block_size; + length = tdb->transaction->block_size; + if (i == tdb->transaction->num_blocks-1) { + length = tdb->transaction->last_block_size; + } + + if (offset >= old_map_size) { continue; } - if (el->offset + el->length > tdb->transaction->old_map_size) { + if (offset + length > tdb->transaction->old_map_size) { TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: transaction data over new region boundary\n")); free(data); tdb->ecode = TDB_ERR_CORRUPT; return -1; } - memcpy(p, &el->offset, 4); - memcpy(p+4, &el->length, 4); + memcpy(p, &offset, 4); + memcpy(p+4, &length, 4); if (DOCONV()) { tdb_convert(p, 8); } /* the recovery area contains the old data, not the new data, so we have to call the original tdb_read method to get it */ - if (methods->tdb_read(tdb, el->offset, p + 8, el->length, 0) != 0) { + if (methods->tdb_read(tdb, offset, p + 8, length, 0) != 0) { free(data); tdb->ecode = TDB_ERR_IO; return -1; } - p += 8 + el->length; + p += 8 + length; } /* and the tailer */ @@ -742,6 +784,12 @@ static int transaction_setup_recovery(struct tdb_context *tdb, tdb->ecode = TDB_ERR_IO; return -1; } + if (transaction_write_existing(tdb, recovery_offset, data, sizeof(*rec) + recovery_size) == -1) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write secondary recovery data\n")); + free(data); + tdb->ecode = TDB_ERR_IO; + return -1; + } /* as we don't have ordered writes, we have to sync the recovery data before we update the magic to indicate that the recovery @@ -763,6 +811,11 @@ static int transaction_setup_recovery(struct tdb_context *tdb, tdb->ecode = TDB_ERR_IO; return -1; } + if (transaction_write_existing(tdb, *magic_offset, &magic, sizeof(magic)) == -1) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write secondary recovery magic\n")); + tdb->ecode = TDB_ERR_IO; + return -1; + } /* ensure the recovery magic marker is on disk */ if (transaction_sync(tdb, *magic_offset, sizeof(magic)) == -1) { @@ -780,6 +833,7 @@ int tdb_transaction_commit(struct tdb_context *tdb) const struct tdb_methods *methods; tdb_off_t magic_offset = 0; uint32_t zero = 0; + int i; if (tdb->transaction == NULL) { TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: no transaction\n")); @@ -793,13 +847,14 @@ int tdb_transaction_commit(struct tdb_context *tdb) return -1; } + if (tdb->transaction->nesting != 0) { tdb->transaction->nesting--; return 0; } /* check for a null transaction */ - if (tdb->transaction->elements == NULL) { + if (tdb->transaction->blocks == NULL) { tdb_transaction_cancel(tdb); return 0; } @@ -858,10 +913,21 @@ int tdb_transaction_commit(struct tdb_context *tdb) } /* perform all the writes */ - while (tdb->transaction->elements) { - struct tdb_transaction_el *el = tdb->transaction->elements; + for (i=0;i<tdb->transaction->num_blocks;i++) { + tdb_off_t offset; + tdb_len_t length; + + if (tdb->transaction->blocks[i] == NULL) { + continue; + } - if (methods->tdb_write(tdb, el->offset, el->data, el->length) == -1) { + offset = i * tdb->transaction->block_size; + length = tdb->transaction->block_size; + if (i == tdb->transaction->num_blocks-1) { + length = tdb->transaction->last_block_size; + } + + if (methods->tdb_write(tdb, offset, tdb->transaction->blocks[i], length) == -1) { TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed during commit\n")); /* we've overwritten part of the data and @@ -876,11 +942,12 @@ int tdb_transaction_commit(struct tdb_context *tdb) TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed\n")); return -1; } - tdb->transaction->elements = el->next; - free(el->data); - free(el); + SAFE_FREE(tdb->transaction->blocks[i]); } + SAFE_FREE(tdb->transaction->blocks); + tdb->transaction->num_blocks = 0; + if (!(tdb->flags & TDB_NOSYNC)) { /* ensure the new data is on disk */ if (transaction_sync(tdb, 0, tdb->map_size) == -1) { @@ -919,6 +986,7 @@ int tdb_transaction_commit(struct tdb_context *tdb) /* use a transaction cancel to free memory and remove the transaction locks */ tdb_transaction_cancel(tdb); + return 0; } diff --git a/source3/lib/tdb/common/traverse.c b/source3/lib/tdb/common/traverse.c index 27b2cfc54a..07b0c23858 100644 --- a/source3/lib/tdb/common/traverse.c +++ b/source3/lib/tdb/common/traverse.c @@ -241,7 +241,9 @@ int tdb_traverse(struct tdb_context *tdb, return -1; } + tdb->traverse_write++; ret = tdb_traverse_internal(tdb, fn, private_data, &tl); + tdb->traverse_write--; tdb_transaction_unlock(tdb); @@ -333,3 +335,4 @@ TDB_DATA tdb_nextkey(struct tdb_context *tdb, TDB_DATA oldkey) TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_nextkey: WARNING tdb_unlock failed!\n")); return key; } + |