1 files changed, 120 insertions, 51 deletions
diff --git a/source3/lib/dbwrap_ctdb.c b/source3/lib/dbwrap_ctdb.c
index bb56b66bdc..bdcec3985d 100644
--- a/source3/lib/dbwrap_ctdb.c
+++ b/source3/lib/dbwrap_ctdb.c
@@ -33,6 +33,11 @@ struct db_ctdb_rec {
 	struct ctdb_ltdb_header header;
 };
 
+static struct db_record *fetch_locked_internal(struct db_ctdb_ctx *ctx,
+					       TALLOC_CTX *mem_ctx,
+					       TDB_DATA key,
+					       bool persistent);
+
 static NTSTATUS db_ctdb_store(struct db_record *rec, TDB_DATA data, int flag)
 {
 	struct db_ctdb_rec *crec = talloc_get_type_abort(
@@ -62,61 +67,116 @@ static NTSTATUS db_ctdb_store(struct db_record *rec, TDB_DATA data, int flag)
    store */
 static NTSTATUS db_ctdb_store_persistent(struct db_record *rec, TDB_DATA data, int flag)
 {
-	struct db_ctdb_rec *crec = talloc_get_type_abort(
-		rec->private_data, struct db_ctdb_rec);
+	struct db_ctdb_rec *crec;
+	struct db_record *record;
 	TDB_DATA cdata;
 	int ret;
 	NTSTATUS status;
+	uint32_t count;
+	int max_retries = lp_parm_int(-1, "dbwrap ctdb", "max store retries", 5);
+
+	for (count = 0, status = NT_STATUS_UNSUCCESSFUL, record = rec;
+	     (count < max_retries) && !NT_STATUS_IS_OK(status);
+	     count++)
+	{
+		if (count > 0) {
+			/* retry */
+			/*
+			 * There is a hack here: We use rec as a memory
+			 * context and re-use it as the record struct ptr.
+			 * We don't free the record data allocated
+			 * in each turn. So all gets freed when the caller
+			 * releases the original record. This is because
+			 * we don't get the record passed in by reference
+			 * in the first place and the caller relies on
+			 * having to free the record himself.
+			 */
+			record = fetch_locked_internal(crec->ctdb_ctx,
+						       rec,
+						       rec->key,
+						       true /* persistent */);
+			if (record == NULL) {
+				DEBUG(5, ("fetch_locked_internal failed.\n"));
+				status = NT_STATUS_NO_MEMORY;
+				break;
+			}
+		}
 
-	cdata.dsize = sizeof(crec->header) + data.dsize;
+		crec = talloc_get_type_abort(record->private_data,
+					     struct db_ctdb_rec);
 
-	if (!(cdata.dptr = SMB_MALLOC_ARRAY(uint8, cdata.dsize))) {
-		return NT_STATUS_NO_MEMORY;
-	}
+		cdata.dsize = sizeof(crec->header) + data.dsize;
 
-	crec->header.rsn++;
+		if (!(cdata.dptr = SMB_MALLOC_ARRAY(uint8, cdata.dsize))) {
+			return NT_STATUS_NO_MEMORY;
+		}
 
-	memcpy(cdata.dptr, &crec->header, sizeof(crec->header));
-	memcpy(cdata.dptr + sizeof(crec->header), data.dptr, data.dsize);
+		crec->header.rsn++;
 
-	status = ctdbd_start_persistent_update(messaging_ctdbd_connection(), crec->ctdb_ctx->db_id, rec->key, cdata);
-	
-	if (NT_STATUS_IS_OK(status)) {
-		ret = tdb_store(crec->ctdb_ctx->wtdb->tdb, rec->key, cdata, TDB_REPLACE);
-		status = (ret == 0) ? NT_STATUS_OK : NT_STATUS_INTERNAL_DB_CORRUPTION;
-	}
+		memcpy(cdata.dptr, &crec->header, sizeof(crec->header));
+		memcpy(cdata.dptr + sizeof(crec->header), data.dptr, data.dsize);
 
-	/*
-	 * release the lock *now* in order to prevent deadlocks.
-	 *
-	 * There is a tradeoff: Usually, the record is still locked
-	 * after db->store operation. This lock is usually released
-	 * via the talloc destructor with the TALLOC_FREE to
-	 * the record. So we have two choices:
-	 *
-	 * - Either re-lock the record after the call to persistent_store
-	 *   or cancel_persistent update and this way not changing any
-	 *   assumptions callers may have about the state, but possibly
-	 *   introducing new race conditions.
-	 *
-	 * - Or don't lock the record again but just remove the
-	 *   talloc_destructor. This is less racy but assumes that
-	 *   the lock is always released via TALLOC_FREE of the record.
-	 *
-	 * I choose the first variant for now since it seems less racy.
-	 * We can't guarantee that we succeed in getting the lock
-	 * anyways. The only real danger here is that a caller
-	 * performs multiple store operations after a fetch_locked()
-	 * which is currently not the case.
-	 */
-	tdb_chainunlock(crec->ctdb_ctx->wtdb->tdb, rec->key);
-	talloc_set_destructor(rec, NULL);
+		status = ctdbd_start_persistent_update(
+				messaging_ctdbd_connection(),
+				crec->ctdb_ctx->db_id,
+				rec->key,
+				cdata);
 
-	/* now tell ctdbd to update this record on all other nodes */
-	if (NT_STATUS_IS_OK(status)) {
-		status = ctdbd_persistent_store(messaging_ctdbd_connection(), crec->ctdb_ctx->db_id, rec->key, cdata);
-	} else {
-		ctdbd_cancel_persistent_update(messaging_ctdbd_connection(), crec->ctdb_ctx->db_id, rec->key, cdata);
+		if (NT_STATUS_IS_OK(status)) {
+			ret = tdb_store(crec->ctdb_ctx->wtdb->tdb, rec->key,
+					cdata, TDB_REPLACE);
+			status = (ret == 0) ? NT_STATUS_OK
+					    : NT_STATUS_INTERNAL_DB_CORRUPTION;
+		}
+
+		/*
+		 * release the lock *now* in order to prevent deadlocks.
+		 *
+		 * There is a tradeoff: Usually, the record is still locked
+		 * after db->store operation. This lock is usually released
+		 * via the talloc destructor with the TALLOC_FREE to
+		 * the record. So we have two choices:
+		 *
+		 * - Either re-lock the record after the call to persistent_store
+		 *   or cancel_persistent update and this way not changing any
+		 *   assumptions callers may have about the state, but possibly
+		 *   introducing new race conditions.
+		 *
+		 * - Or don't lock the record again but just remove the
+		 *   talloc_destructor. This is less racy but assumes that
+		 *   the lock is always released via TALLOC_FREE of the record.
+		 *
+		 * I choose the first variant for now since it seems less racy.
+		 * We can't guarantee that we succeed in getting the lock
+		 * anyways. The only real danger here is that a caller
+		 * performs multiple store operations after a fetch_locked()
+		 * which is currently not the case.
+		 */
+		tdb_chainunlock(crec->ctdb_ctx->wtdb->tdb, rec->key);
+		talloc_set_destructor(record, NULL);
+
+		/* now tell ctdbd to update this record on all other nodes */
+		if (NT_STATUS_IS_OK(status)) {
+			status = ctdbd_persistent_store(
+					messaging_ctdbd_connection(),
+					crec->ctdb_ctx->db_id,
+					rec->key,
+					cdata);
+		} else {
+			ctdbd_cancel_persistent_update(
+					messaging_ctdbd_connection(),
+					crec->ctdb_ctx->db_id,
+					rec->key,
+					cdata);
+		}
+
+		SAFE_FREE(cdata.dptr);
+	} /* retry-loop */
+
+	if (!NT_STATUS_IS_OK(status)) {
+		DEBUG(5, ("ctdbd_persistent_store still failed after "
+			  "%d retries with error %s - giving up.\n",
+			  count, nt_errstr(status)));
 	}
 
 	SAFE_FREE(cdata.dptr);
@@ -164,12 +224,11 @@ static int db_ctdb_record_destr(struct db_record* data)
 	return 0;
 }
 
-static struct db_record *db_ctdb_fetch_locked(struct db_context *db,
-					      TALLOC_CTX *mem_ctx,
-					      TDB_DATA key)
+static struct db_record *fetch_locked_internal(struct db_ctdb_ctx *ctx,
+					       TALLOC_CTX *mem_ctx,
+					       TDB_DATA key,
+					       bool persistent)
 {
-	struct db_ctdb_ctx *ctx = talloc_get_type_abort(db->private_data,
-							struct db_ctdb_ctx);
 	struct db_record *result;
 	struct db_ctdb_rec *crec;
 	NTSTATUS status;
@@ -218,7 +277,7 @@ again:
 		return NULL;
 	}
 
-	if (db->persistent) {
+	if (persistent) {
 		result->store = db_ctdb_store_persistent;
 	} else {
 		result->store = db_ctdb_store;
@@ -285,6 +344,16 @@ again:
 	return result;
 }
 
+static struct db_record *db_ctdb_fetch_locked(struct db_context *db,
+					      TALLOC_CTX *mem_ctx,
+					      TDB_DATA key)
+{
+	struct db_ctdb_ctx *ctx = talloc_get_type_abort(db->private_data,
+							struct db_ctdb_ctx);
+
+	return fetch_locked_internal(ctx, mem_ctx, key, db->persistent);
+}
+
 /*
   fetch (unlocked, no migration) operation on ctdb
  */