summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorVolker Lendecke <vl@samba.org>2012-10-02 15:26:14 +0200
committerVolker Lendecke <vl@samba.org>2012-10-06 13:23:42 +0200
commitc62f8baff878001ead921112dd653ff69d1cfe7d (patch)
tree4d9c706d56eba1f53d3a2a80e004619d06d5963f
parent37fd93194db10fc832ed3fa1ec880ebc26be904b (diff)
downloadsamba-c62f8baff878001ead921112dd653ff69d1cfe7d.tar.gz
samba-c62f8baff878001ead921112dd653ff69d1cfe7d.tar.bz2
samba-c62f8baff878001ead921112dd653ff69d1cfe7d.zip
tdb: Make tdb robust against improper CLEAR_IF_FIRST restart
When winbind is restarted, there is a potential crash in tdb. Following situation: We are in a cluster with ctdb. A winbind child hangs in a request to the DC. Cluster monitoring decides the node has a problem. Cluster monitoring decides to kill ctdbd. winbind child still hangs in a RPC request. winbind parent figures that ctdb is dead and immediately commits suicide. winbind parent is restarted by cluster management, overwriting gencache.tdb with CLEAR_IF_FIRST. The CLEAR_IF_FIRST logic as implemented now will not see that a child still has the tdb open, only the parent holds the ACTIVE_LOCK due to performance reasons. During the CLEAR_IF_FIRST logic is done, there is a very small window where we ftruncate(tfd, 0) the file and re-write a proper header without a lock. When during this small window the winbind child comes back, wanting to store something into gencache.tdb, that winbind child will crash with a SIGBUS. Sounds unlikely? See: [2012/09/29 07:02:31.871607, 0] lib/util.c:1183(smb_panic) PANIC (pid 1814517): internal error [2012/09/29 07:02:31.877596, 0] lib/util.c:1287(log_stack_trace) BACKTRACE: 35 stack frames: #0 winbindd(log_stack_trace+0x1a) [0x7feb7d4ca18a] #1 winbindd(smb_panic+0x2b) [0x7feb7d4ca25b] #2 winbindd(+0x1a3cc4) [0x7feb7d4bacc4] #3 /lib64/libc.so.6(+0x32900) [0x7feb7a929900] #4 /lib64/libc.so.6(memcpy+0x35) [0x7feb7a97f355] #5 /usr/lib64/libtdb.so.1(+0x6e76) [0x7feb7b0b0e76] #6 /usr/lib64/libtdb.so.1(+0x3d37) [0x7feb7b0add37] #7 /usr/lib64/libtdb.so.1(+0x863d) [0x7feb7b0b263d] #8 /usr/lib64/libtdb.so.1(+0x8700) [0x7feb7b0b2700] #9 /usr/lib64/libtdb.so.1(+0x2505) [0x7feb7b0ac505] #10 /usr/lib64/libtdb.so.1(+0x25b7) [0x7feb7b0ac5b7] #11 /usr/lib64/libtdb.so.1(tdb_fetch+0x13) [0x7feb7b0ac633] #12 winbindd(gencache_set_data_blob+0x259) [0x7feb7d4d8449] #13 winbindd(gencache_set+0x53) [0x7feb7d4d85b3] #14 winbindd(gencache_del+0x5e) [0x7feb7d4d879e] #15 winbindd(saf_delete+0x93) [0x7feb7d54b693] #16 winbindd(+0xe507e) [0x7feb7d3fc07e] #17 winbindd(+0xe85e5) [0x7feb7d3ff5e5] #18 winbindd(+0xe65be) [0x7feb7d3fd5be] #19 winbindd(+0xe7562) [0x7feb7d3fe562] #20 winbindd(init_dc_connection+0x2e) [0x7feb7d3fe5be] #21 winbindd(+0xe75d9) [0x7feb7d3fe5d9] #22 winbindd(cm_connect_netlogon+0x58) [0x7feb7d3fe658] #23 winbindd(_wbint_PingDc+0x61) [0x7feb7d410991] #24 winbindd(+0x103175) [0x7feb7d41a175] #25 winbindd(winbindd_dual_ndrcmd+0xb7) [0x7feb7d4107d7] #26 winbindd(+0xf8609) [0x7feb7d40f609] #27 winbindd(+0xf9075) [0x7feb7d410075] #28 winbindd(tevent_common_loop_immediate+0xe8) [0x7feb7d4db198] #29 winbindd(run_events_poll+0x3c) [0x7feb7d4d93fc] #30 winbindd(+0x1c2b52) [0x7feb7d4d9b52] #31 winbindd(_tevent_loop_once+0x90) [0x7feb7d4d9f60] #32 winbindd(main+0x7b3) [0x7feb7d3e7aa3] #33 /lib64/libc.so.6(__libc_start_main+0xfd) [0x7feb7a915cdd] #34 winbindd(+0xce2a9) [0x7feb7d3e52a9] This is in a winbind child, logfiles surrounding indicate the parent was restarted. This patch takes all chain locks around the CLEAR_IF_FIRST introduced tdb_new_database.
-rw-r--r--lib/tdb/common/open.c32
1 files changed, 28 insertions, 4 deletions
diff --git a/lib/tdb/common/open.c b/lib/tdb/common/open.c
index 8836f8442e..d9f76f0835 100644
--- a/lib/tdb/common/open.c
+++ b/lib/tdb/common/open.c
@@ -313,12 +313,36 @@ _PUBLIC_ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int td
if ((tdb_flags & TDB_CLEAR_IF_FIRST) &&
(!tdb->read_only) &&
(locked = (tdb_nest_lock(tdb, ACTIVE_LOCK, F_WRLCK, TDB_LOCK_NOWAIT|TDB_LOCK_PROBE) == 0))) {
- open_flags |= O_CREAT;
- if (ftruncate(tdb->fd, 0) == -1) {
+ int ret;
+ ret = tdb_brlock(tdb, F_WRLCK, FREELIST_TOP, 0,
+ TDB_LOCK_WAIT);
+ if (ret == -1) {
TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_open_ex: "
- "failed to truncate %s: %s\n",
+ "tdb_brlock failed for %s: %s\n",
name, strerror(errno)));
- goto fail; /* errno set by ftruncate */
+ goto fail;
+ }
+ ret = tdb_new_database(tdb, hash_size);
+ if (ret == -1) {
+ TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_open_ex: "
+ "tdb_new_database failed for %s: %s\n",
+ name, strerror(errno)));
+ tdb_unlockall(tdb);
+ goto fail;
+ }
+ ret = tdb_brunlock(tdb, F_WRLCK, FREELIST_TOP, 0);
+ if (ret == -1) {
+ TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_open_ex: "
+ "tdb_unlockall failed for %s: %s\n",
+ name, strerror(errno)));
+ goto fail;
+ }
+ ret = lseek(tdb->fd, 0, SEEK_SET);
+ if (ret == -1) {
+ TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_open_ex: "
+ "lseek failed for %s: %s\n",
+ name, strerror(errno)));
+ goto fail;
}
}