summaryrefslogtreecommitdiff
path: root/source4/cluster/ctdb/common
diff options
context:
space:
mode:
Diffstat (limited to 'source4/cluster/ctdb/common')
-rw-r--r--source4/cluster/ctdb/common/cmdline.c132
-rw-r--r--source4/cluster/ctdb/common/ctdb_io.c338
-rw-r--r--source4/cluster/ctdb/common/ctdb_ltdb.c178
-rw-r--r--source4/cluster/ctdb/common/ctdb_message.c112
-rw-r--r--source4/cluster/ctdb/common/ctdb_util.c285
-rw-r--r--source4/cluster/ctdb/common/system.c385
6 files changed, 1430 insertions, 0 deletions
diff --git a/source4/cluster/ctdb/common/cmdline.c b/source4/cluster/ctdb/common/cmdline.c
new file mode 100644
index 0000000000..df01110e8a
--- /dev/null
+++ b/source4/cluster/ctdb/common/cmdline.c
@@ -0,0 +1,132 @@
+/*
+ common commandline code to ctdb test tools
+
+ Copyright (C) Andrew Tridgell 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "lib/events/events.h"
+#include "system/filesys.h"
+#include "popt.h"
+#include "../include/ctdb.h"
+#include "../include/ctdb_private.h"
+
+/* Handle common command line options for ctdb test progs
+ */
+
+static struct {
+ const char *socketname;
+ int torture;
+ const char *events;
+} ctdb_cmdline = {
+ .socketname = CTDB_PATH,
+ .torture = 0,
+};
+
+enum {OPT_EVENTSYSTEM=1};
+
+static void ctdb_cmdline_callback(poptContext con,
+ enum poptCallbackReason reason,
+ const struct poptOption *opt,
+ const char *arg, const void *data)
+{
+ switch (opt->val) {
+ case OPT_EVENTSYSTEM:
+ event_set_default_backend(arg);
+ break;
+ }
+}
+
+
+struct poptOption popt_ctdb_cmdline[] = {
+ { NULL, 0, POPT_ARG_CALLBACK, (void *)ctdb_cmdline_callback },
+ { "socket", 0, POPT_ARG_STRING, &ctdb_cmdline.socketname, 0, "local socket name", "filename" },
+ { "debug", 'd', POPT_ARG_INT, &LogLevel, 0, "debug level"},
+ { "torture", 0, POPT_ARG_NONE, &ctdb_cmdline.torture, 0, "enable nastiness in library", NULL },
+ { "events", 0, POPT_ARG_STRING, NULL, OPT_EVENTSYSTEM, "event system", NULL },
+ { NULL }
+};
+
+
+/*
+ startup daemon side of ctdb according to command line options
+ */
+struct ctdb_context *ctdb_cmdline_init(struct event_context *ev)
+{
+ struct ctdb_context *ctdb;
+ int ret;
+
+ /* initialise ctdb */
+ ctdb = ctdb_init(ev);
+ if (ctdb == NULL) {
+ printf("Failed to init ctdb\n");
+ exit(1);
+ }
+
+ if (ctdb_cmdline.torture) {
+ ctdb_set_flags(ctdb, CTDB_FLAG_TORTURE);
+ }
+
+ /* tell ctdb the socket address */
+ ret = ctdb_set_socketname(ctdb, ctdb_cmdline.socketname);
+ if (ret == -1) {
+ printf("ctdb_set_socketname failed - %s\n", ctdb_errstr(ctdb));
+ exit(1);
+ }
+
+ return ctdb;
+}
+
+
+/*
+ startup a client only ctdb context
+ */
+struct ctdb_context *ctdb_cmdline_client(struct event_context *ev)
+{
+ struct ctdb_context *ctdb;
+ int ret;
+
+ /* initialise ctdb */
+ ctdb = ctdb_init(ev);
+ if (ctdb == NULL) {
+ fprintf(stderr, "Failed to init ctdb\n");
+ exit(1);
+ }
+
+ /* tell ctdb the socket address */
+ ret = ctdb_set_socketname(ctdb, ctdb_cmdline.socketname);
+ if (ret == -1) {
+ fprintf(stderr, "ctdb_set_socketname failed - %s\n", ctdb_errstr(ctdb));
+ exit(1);
+ }
+
+ ret = ctdb_socket_connect(ctdb);
+ if (ret != 0) {
+ fprintf(stderr, __location__ " Failed to connect to daemon\n");
+ talloc_free(ctdb);
+ return NULL;
+ }
+
+ /* get our vnn */
+ ctdb->vnn = ctdb_ctrl_getvnn(ctdb, timeval_zero(), CTDB_CURRENT_NODE);
+ if (ctdb->vnn == (uint32_t)-1) {
+ DEBUG(0,(__location__ " Failed to get ctdb vnn\n"));
+ talloc_free(ctdb);
+ return NULL;
+ }
+
+ return ctdb;
+}
diff --git a/source4/cluster/ctdb/common/ctdb_io.c b/source4/cluster/ctdb/common/ctdb_io.c
new file mode 100644
index 0000000000..ca9c635878
--- /dev/null
+++ b/source4/cluster/ctdb/common/ctdb_io.c
@@ -0,0 +1,338 @@
+/*
+ ctdb database library
+ Utility functions to read/write blobs of data from a file descriptor
+ and handle the case where we might need multiple read/writes to get all the
+ data.
+
+ Copyright (C) Andrew Tridgell 2006
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "lib/tdb/include/tdb.h"
+#include "lib/events/events.h"
+#include "lib/util/dlinklist.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "../include/ctdb_private.h"
+#include "../include/ctdb.h"
+
+/* structures for packet queueing - see common/ctdb_io.c */
+struct ctdb_partial {
+ uint8_t *data;
+ uint32_t length;
+};
+
+struct ctdb_queue_pkt {
+ struct ctdb_queue_pkt *next, *prev;
+ uint8_t *data;
+ uint32_t length;
+ uint32_t full_length;
+};
+
+struct ctdb_queue {
+ struct ctdb_context *ctdb;
+ struct ctdb_partial partial; /* partial input packet */
+ struct ctdb_queue_pkt *out_queue;
+ struct fd_event *fde;
+ int fd;
+ size_t alignment;
+ void *private_data;
+ ctdb_queue_cb_fn_t callback;
+};
+
+
+
+/*
+ called when an incoming connection is readable
+*/
+static void queue_io_read(struct ctdb_queue *queue)
+{
+ int num_ready = 0;
+ ssize_t nread;
+ uint8_t *data, *data_base;
+
+ if (ioctl(queue->fd, FIONREAD, &num_ready) != 0) {
+ return;
+ }
+ if (num_ready == 0) {
+ /* the descriptor has been closed */
+ goto failed;
+ }
+
+
+ queue->partial.data = talloc_realloc(queue, queue->partial.data,
+ uint8_t,
+ num_ready + queue->partial.length);
+
+ if (queue->partial.data == NULL) {
+ DEBUG(0,("read error alloc failed for %u\n",
+ num_ready + queue->partial.length));
+ goto failed;
+ }
+
+ nread = read(queue->fd, queue->partial.data + queue->partial.length, num_ready);
+ if (nread <= 0) {
+ DEBUG(0,("read error nread=%d\n", (int)nread));
+ goto failed;
+ }
+
+
+ data = queue->partial.data;
+ nread += queue->partial.length;
+
+ queue->partial.data = NULL;
+ queue->partial.length = 0;
+
+ if (nread >= 4 && *(uint32_t *)data == nread) {
+ /* it is the responsibility of the incoming packet
+ function to free 'data' */
+ queue->callback(data, nread, queue->private_data);
+ return;
+ }
+
+ data_base = data;
+
+ while (nread >= 4 && *(uint32_t *)data <= nread) {
+ /* we have at least one packet */
+ uint8_t *d2;
+ uint32_t len;
+ len = *(uint32_t *)data;
+ if (len == 0) {
+ /* bad packet! treat as EOF */
+ DEBUG(0,("Invalid packet of length 0\n"));
+ goto failed;
+ }
+ d2 = (uint8_t *)talloc_memdup(queue, data, len);
+ if (d2 == NULL) {
+ DEBUG(0,("read error memdup failed for %u\n", len));
+ /* sigh */
+ goto failed;
+ }
+ queue->callback(d2, len, queue->private_data);
+ data += len;
+ nread -= len;
+ }
+
+ if (nread > 0) {
+ /* we have only part of a packet */
+ if (data_base == data) {
+ queue->partial.data = data;
+ queue->partial.length = nread;
+ } else {
+ queue->partial.data = (uint8_t *)talloc_memdup(queue, data, nread);
+ if (queue->partial.data == NULL) {
+ DEBUG(0,("read error memdup partial failed for %u\n",
+ (unsigned)nread));
+ goto failed;
+ }
+ queue->partial.length = nread;
+ talloc_free(data_base);
+ }
+ return;
+ }
+
+ talloc_free(data_base);
+ return;
+
+failed:
+ queue->callback(NULL, 0, queue->private_data);
+}
+
+
+/* used when an event triggers a dead queue */
+static void queue_dead(struct event_context *ev, struct timed_event *te,
+ struct timeval t, void *private_data)
+{
+ struct ctdb_queue *queue = talloc_get_type(private_data, struct ctdb_queue);
+ queue->callback(NULL, 0, queue->private_data);
+}
+
+
+/*
+ called when an incoming connection is writeable
+*/
+static void queue_io_write(struct ctdb_queue *queue)
+{
+ while (queue->out_queue) {
+ struct ctdb_queue_pkt *pkt = queue->out_queue;
+ ssize_t n;
+ if (queue->ctdb->flags & CTDB_FLAG_TORTURE) {
+ n = write(queue->fd, pkt->data, 1);
+ } else {
+ n = write(queue->fd, pkt->data, pkt->length);
+ }
+
+ if (n == -1 && errno != EAGAIN && errno != EWOULDBLOCK) {
+ if (pkt->length != pkt->full_length) {
+ /* partial packet sent - we have to drop it */
+ DLIST_REMOVE(queue->out_queue, pkt);
+ talloc_free(pkt);
+ }
+ talloc_free(queue->fde);
+ queue->fde = NULL;
+ queue->fd = -1;
+ event_add_timed(queue->ctdb->ev, queue, timeval_zero(),
+ queue_dead, queue);
+ return;
+ }
+ if (n <= 0) return;
+
+ if (n != pkt->length) {
+ pkt->length -= n;
+ pkt->data += n;
+ return;
+ }
+
+ DLIST_REMOVE(queue->out_queue, pkt);
+ talloc_free(pkt);
+ }
+
+ EVENT_FD_NOT_WRITEABLE(queue->fde);
+}
+
+/*
+ called when an incoming connection is readable or writeable
+*/
+static void queue_io_handler(struct event_context *ev, struct fd_event *fde,
+ uint16_t flags, void *private_data)
+{
+ struct ctdb_queue *queue = talloc_get_type(private_data, struct ctdb_queue);
+
+ if (flags & EVENT_FD_READ) {
+ queue_io_read(queue);
+ } else {
+ queue_io_write(queue);
+ }
+}
+
+
+/*
+ queue a packet for sending
+*/
+int ctdb_queue_send(struct ctdb_queue *queue, uint8_t *data, uint32_t length)
+{
+ struct ctdb_queue_pkt *pkt;
+ uint32_t length2, full_length;
+
+ if (queue->alignment) {
+ /* enforce the length and alignment rules from the tcp packet allocator */
+ length2 = (length+(queue->alignment-1)) & ~(queue->alignment-1);
+ *(uint32_t *)data = length2;
+ } else {
+ length2 = length;
+ }
+
+ if (length2 != length) {
+ memset(data+length, 0, length2-length);
+ }
+
+ full_length = length2;
+
+ /* if the queue is empty then try an immediate write, avoiding
+ queue overhead. This relies on non-blocking sockets */
+ if (queue->out_queue == NULL && queue->fd != -1 &&
+ !(queue->ctdb->flags & CTDB_FLAG_TORTURE)) {
+ ssize_t n = write(queue->fd, data, length2);
+ if (n == -1 && errno != EAGAIN && errno != EWOULDBLOCK) {
+ talloc_free(queue->fde);
+ queue->fde = NULL;
+ queue->fd = -1;
+ event_add_timed(queue->ctdb->ev, queue, timeval_zero(),
+ queue_dead, queue);
+ /* yes, we report success, as the dead node is
+ handled via a separate event */
+ return 0;
+ }
+ if (n > 0) {
+ data += n;
+ length2 -= n;
+ }
+ if (length2 == 0) return 0;
+ }
+
+ pkt = talloc(queue, struct ctdb_queue_pkt);
+ CTDB_NO_MEMORY(queue->ctdb, pkt);
+
+ pkt->data = (uint8_t *)talloc_memdup(pkt, data, length2);
+ CTDB_NO_MEMORY(queue->ctdb, pkt->data);
+
+ pkt->length = length2;
+ pkt->full_length = full_length;
+
+ if (queue->out_queue == NULL && queue->fd != -1) {
+ EVENT_FD_WRITEABLE(queue->fde);
+ }
+
+ DLIST_ADD_END(queue->out_queue, pkt, struct ctdb_queue_pkt *);
+
+ return 0;
+}
+
+
+/*
+ setup the fd used by the queue
+ */
+int ctdb_queue_set_fd(struct ctdb_queue *queue, int fd)
+{
+ queue->fd = fd;
+ talloc_free(queue->fde);
+ queue->fde = NULL;
+
+ if (fd != -1) {
+ queue->fde = event_add_fd(queue->ctdb->ev, queue, fd, EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
+ queue_io_handler, queue);
+ if (queue->fde == NULL) {
+ return -1;
+ }
+
+ if (queue->out_queue) {
+ EVENT_FD_WRITEABLE(queue->fde);
+ }
+ }
+
+ return 0;
+}
+
+
+
+/*
+ setup a packet queue on a socket
+ */
+struct ctdb_queue *ctdb_queue_setup(struct ctdb_context *ctdb,
+ TALLOC_CTX *mem_ctx, int fd, int alignment,
+
+ ctdb_queue_cb_fn_t callback,
+ void *private_data)
+{
+ struct ctdb_queue *queue;
+
+ queue = talloc_zero(mem_ctx, struct ctdb_queue);
+ CTDB_NO_MEMORY_NULL(ctdb, queue);
+
+ queue->ctdb = ctdb;
+ queue->fd = fd;
+ queue->alignment = alignment;
+ queue->private_data = private_data;
+ queue->callback = callback;
+ if (fd != -1) {
+ if (ctdb_queue_set_fd(queue, fd) != 0) {
+ talloc_free(queue);
+ return NULL;
+ }
+ }
+
+ return queue;
+}
diff --git a/source4/cluster/ctdb/common/ctdb_ltdb.c b/source4/cluster/ctdb/common/ctdb_ltdb.c
new file mode 100644
index 0000000000..92adc4a12a
--- /dev/null
+++ b/source4/cluster/ctdb/common/ctdb_ltdb.c
@@ -0,0 +1,178 @@
+/*
+ ctdb ltdb code
+
+ Copyright (C) Andrew Tridgell 2006
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "lib/events/events.h"
+#include "lib/tdb/include/tdb.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "../include/ctdb_private.h"
+#include "tdb_wrap.h"
+#include "lib/util/dlinklist.h"
+
+/*
+ find an attached ctdb_db handle given a name
+ */
+struct ctdb_db_context *ctdb_db_handle(struct ctdb_context *ctdb, const char *name)
+{
+ struct ctdb_db_context *tmp_db;
+ for (tmp_db=ctdb->db_list;tmp_db;tmp_db=tmp_db->next) {
+ if (strcmp(name, tmp_db->db_name) == 0) {
+ return tmp_db;
+ }
+ }
+ return NULL;
+}
+
+
+/*
+ return the lmaster given a key
+*/
+uint32_t ctdb_lmaster(struct ctdb_context *ctdb, const TDB_DATA *key)
+{
+ uint32_t idx, lmaster;
+
+ idx = ctdb_hash(key) % ctdb->vnn_map->size;
+ lmaster = ctdb->vnn_map->map[idx];
+
+ return lmaster;
+}
+
+
+/*
+ construct an initial header for a record with no ltdb header yet
+*/
+static void ltdb_initial_header(struct ctdb_db_context *ctdb_db,
+ TDB_DATA key,
+ struct ctdb_ltdb_header *header)
+{
+ header->rsn = 0;
+ /* initial dmaster is the lmaster */
+ header->dmaster = ctdb_lmaster(ctdb_db->ctdb, &key);
+ header->laccessor = header->dmaster;
+ header->lacount = 0;
+}
+
+
+/*
+ fetch a record from the ltdb, separating out the header information
+ and returning the body of the record. A valid (initial) header is
+ returned if the record is not present
+*/
+int ctdb_ltdb_fetch(struct ctdb_db_context *ctdb_db,
+ TDB_DATA key, struct ctdb_ltdb_header *header,
+ TALLOC_CTX *mem_ctx, TDB_DATA *data)
+{
+ TDB_DATA rec;
+ struct ctdb_context *ctdb = ctdb_db->ctdb;
+
+ rec = tdb_fetch(ctdb_db->ltdb->tdb, key);
+ if (rec.dsize < sizeof(*header)) {
+ TDB_DATA d2;
+ /* return an initial header */
+ if (rec.dptr) free(rec.dptr);
+ if (ctdb->vnn_map == NULL) {
+ /* called from the client */
+ ZERO_STRUCTP(data);
+ header->dmaster = (uint32_t)-1;
+ return -1;
+ }
+ ltdb_initial_header(ctdb_db, key, header);
+ ZERO_STRUCT(d2);
+ if (data) {
+ *data = d2;
+ }
+ ctdb_ltdb_store(ctdb_db, key, header, d2);
+ return 0;
+ }
+
+ *header = *(struct ctdb_ltdb_header *)rec.dptr;
+
+ if (data) {
+ data->dsize = rec.dsize - sizeof(struct ctdb_ltdb_header);
+ data->dptr = (unsigned char *)talloc_memdup(mem_ctx,
+ sizeof(struct ctdb_ltdb_header)+rec.dptr,
+ data->dsize);
+ }
+
+ free(rec.dptr);
+ if (data) {
+ CTDB_NO_MEMORY(ctdb, data->dptr);
+ }
+
+ return 0;
+}
+
+
+/*
+ fetch a record from the ltdb, separating out the header information
+ and returning the body of the record. A valid (initial) header is
+ returned if the record is not present
+*/
+int ctdb_ltdb_store(struct ctdb_db_context *ctdb_db, TDB_DATA key,
+ struct ctdb_ltdb_header *header, TDB_DATA data)
+{
+ struct ctdb_context *ctdb = ctdb_db->ctdb;
+ TDB_DATA rec;
+ int ret;
+
+ if (ctdb->flags & CTDB_FLAG_TORTURE) {
+ struct ctdb_ltdb_header *h2;
+ rec = tdb_fetch(ctdb_db->ltdb->tdb, key);
+ h2 = (struct ctdb_ltdb_header *)rec.dptr;
+ if (rec.dptr && rec.dsize >= sizeof(h2) && h2->rsn > header->rsn) {
+ DEBUG(0,("RSN regression! %llu %llu\n",
+ (unsigned long long)h2->rsn, (unsigned long long)header->rsn));
+ }
+ if (rec.dptr) free(rec.dptr);
+ }
+
+ rec.dsize = sizeof(*header) + data.dsize;
+ rec.dptr = (unsigned char *)talloc_size(ctdb, rec.dsize);
+ CTDB_NO_MEMORY(ctdb, rec.dptr);
+
+ memcpy(rec.dptr, header, sizeof(*header));
+ memcpy(rec.dptr + sizeof(*header), data.dptr, data.dsize);
+
+ ret = tdb_store(ctdb_db->ltdb->tdb, key, rec, TDB_REPLACE);
+ talloc_free(rec.dptr);
+
+ return ret;
+}
+
+
+/*
+ lock a record in the ltdb, given a key
+ */
+int ctdb_ltdb_lock(struct ctdb_db_context *ctdb_db, TDB_DATA key)
+{
+ return tdb_chainlock(ctdb_db->ltdb->tdb, key);
+}
+
+/*
+ unlock a record in the ltdb, given a key
+ */
+int ctdb_ltdb_unlock(struct ctdb_db_context *ctdb_db, TDB_DATA key)
+{
+ int ret = tdb_chainunlock(ctdb_db->ltdb->tdb, key);
+ if (ret != 0) {
+ DEBUG(0,("tdb_chainunlock failed\n"));
+ }
+ return ret;
+}
diff --git a/source4/cluster/ctdb/common/ctdb_message.c b/source4/cluster/ctdb/common/ctdb_message.c
new file mode 100644
index 0000000000..1aea28fd35
--- /dev/null
+++ b/source4/cluster/ctdb/common/ctdb_message.c
@@ -0,0 +1,112 @@
+/*
+ ctdb_message protocol code
+
+ Copyright (C) Andrew Tridgell 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+ see http://wiki.samba.org/index.php/Samba_%26_Clustering for
+ protocol design and packet details
+*/
+#include "includes.h"
+#include "lib/events/events.h"
+#include "lib/tdb/include/tdb.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "../include/ctdb_private.h"
+#include "lib/util/dlinklist.h"
+
+/*
+ this dispatches the messages to the registered ctdb message handler
+*/
+int ctdb_dispatch_message(struct ctdb_context *ctdb, uint64_t srvid, TDB_DATA data)
+{
+ struct ctdb_message_list *ml;
+
+ for (ml=ctdb->message_list;ml;ml=ml->next) {
+ if (ml->srvid == srvid || ml->srvid == CTDB_SRVID_ALL) {
+ ml->message_handler(ctdb, srvid, data, ml->message_private);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ called when a CTDB_REQ_MESSAGE packet comes in
+*/
+void ctdb_request_message(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+ struct ctdb_req_message *c = (struct ctdb_req_message *)hdr;
+ TDB_DATA data;
+
+ data.dptr = &c->data[0];
+ data.dsize = c->datalen;
+
+ ctdb_dispatch_message(ctdb, c->srvid, data);
+}
+
+
+/*
+ when a client goes away, we need to remove its srvid handler from the list
+ */
+static int message_handler_destructor(struct ctdb_message_list *m)
+{
+ DLIST_REMOVE(m->ctdb->message_list, m);
+ return 0;
+}
+
+/*
+ setup handler for receipt of ctdb messages from ctdb_send_message()
+*/
+int ctdb_register_message_handler(struct ctdb_context *ctdb,
+ TALLOC_CTX *mem_ctx,
+ uint64_t srvid,
+ ctdb_message_fn_t handler,
+ void *private_data)
+{
+ struct ctdb_message_list *m;
+
+ m = talloc(mem_ctx, struct ctdb_message_list);
+ CTDB_NO_MEMORY(ctdb, m);
+
+ m->ctdb = ctdb;
+ m->srvid = srvid;
+ m->message_handler = handler;
+ m->message_private = private_data;
+
+ DLIST_ADD(ctdb->message_list, m);
+
+ talloc_set_destructor(m, message_handler_destructor);
+
+ return 0;
+}
+
+
+/*
+ setup handler for receipt of ctdb messages from ctdb_send_message()
+*/
+int ctdb_deregister_message_handler(struct ctdb_context *ctdb, uint64_t srvid, void *private_data)
+{
+ struct ctdb_message_list *m;
+
+ for (m=ctdb->message_list;m;m=m->next) {
+ if (m->srvid == srvid && m->message_private == private_data) {
+ talloc_free(m);
+ return 0;
+ }
+ }
+ return -1;
+}
diff --git a/source4/cluster/ctdb/common/ctdb_util.c b/source4/cluster/ctdb/common/ctdb_util.c
new file mode 100644
index 0000000000..f11388331d
--- /dev/null
+++ b/source4/cluster/ctdb/common/ctdb_util.c
@@ -0,0 +1,285 @@
+/*
+ ctdb utility code
+
+ Copyright (C) Andrew Tridgell 2006
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "lib/events/events.h"
+#include "lib/tdb/include/tdb.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "../include/ctdb_private.h"
+
+int LogLevel;
+
+/*
+ return error string for last error
+*/
+const char *ctdb_errstr(struct ctdb_context *ctdb)
+{
+ return ctdb->err_msg;
+}
+
+
+/*
+ remember an error message
+*/
+void ctdb_set_error(struct ctdb_context *ctdb, const char *fmt, ...)
+{
+ va_list ap;
+ talloc_free(ctdb->err_msg);
+ va_start(ap, fmt);
+ ctdb->err_msg = talloc_vasprintf(ctdb, fmt, ap);
+ DEBUG(0,("ctdb error: %s\n", ctdb->err_msg));
+ va_end(ap);
+}
+
+/*
+ a fatal internal error occurred - no hope for recovery
+*/
+_NORETURN_ void ctdb_fatal(struct ctdb_context *ctdb, const char *msg)
+{
+ DEBUG(0,("ctdb fatal error: %s\n", msg));
+ abort();
+}
+
+/*
+ parse a IP:port pair
+*/
+int ctdb_parse_address(struct ctdb_context *ctdb,
+ TALLOC_CTX *mem_ctx, const char *str,
+ struct ctdb_address *address)
+{
+ struct servent *se;
+
+ setservent(0);
+ se = getservbyname("ctdb", "tcp");
+ endservent();
+
+ address->address = talloc_strdup(mem_ctx, str);
+ if (se == NULL) {
+ address->port = CTDB_PORT;
+ } else {
+ address->port = ntohs(se->s_port);
+ }
+ return 0;
+}
+
+
+/*
+ check if two addresses are the same
+*/
+bool ctdb_same_address(struct ctdb_address *a1, struct ctdb_address *a2)
+{
+ return strcmp(a1->address, a2->address) == 0 && a1->port == a2->port;
+}
+
+
+/*
+ hash function for mapping data to a VNN - taken from tdb
+*/
+uint32_t ctdb_hash(const TDB_DATA *key)
+{
+ uint32_t value; /* Used to compute the hash value. */
+ uint32_t i; /* Used to cycle through random values. */
+
+ /* Set the initial value from the key size. */
+ for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++)
+ value = (value + (key->dptr[i] << (i*5 % 24)));
+
+ return (1103515243 * value + 12345);
+}
+
+/*
+ a type checking varient of idr_find
+ */
+static void *_idr_find_type(struct idr_context *idp, int id, const char *type, const char *location)
+{
+ void *p = idr_find(idp, id);
+ if (p && talloc_check_name(p, type) == NULL) {
+ DEBUG(0,("%s idr_find_type expected type %s but got %s\n",
+ location, type, talloc_get_name(p)));
+ return NULL;
+ }
+ return p;
+}
+
+
+/*
+ update a max latency number
+ */
+void ctdb_latency(double *latency, struct timeval t)
+{
+ double l = timeval_elapsed(&t);
+ if (l > *latency) {
+ *latency = l;
+ }
+}
+
+uint32_t ctdb_reqid_new(struct ctdb_context *ctdb, void *state)
+{
+ uint32_t id;
+
+ id = ctdb->idr_cnt++ & 0xFFFF;
+ id |= (idr_get_new(ctdb->idr, state, 0xFFFF)<<16);
+ return id;
+}
+
+void *_ctdb_reqid_find(struct ctdb_context *ctdb, uint32_t reqid, const char *type, const char *location)
+{
+ void *p;
+
+ p = _idr_find_type(ctdb->idr, (reqid>>16)&0xFFFF, type, location);
+ if (p == NULL) {
+ DEBUG(0, ("Could not find idr:%u\n",reqid));
+ }
+
+ return p;
+}
+
+
+void ctdb_reqid_remove(struct ctdb_context *ctdb, uint32_t reqid)
+{
+ int ret;
+
+ ret = idr_remove(ctdb->idr, (reqid>>16)&0xFFFF);
+ if (ret != 0) {
+ DEBUG(0, ("Removing idr that does not exist\n"));
+ }
+}
+
+
+/*
+ form a ctdb_rec_data record from a key/data pair
+ */
+struct ctdb_rec_data *ctdb_marshall_record(TALLOC_CTX *mem_ctx, uint32_t reqid, TDB_DATA key, TDB_DATA data)
+{
+ size_t length;
+ struct ctdb_rec_data *d;
+
+ length = offsetof(struct ctdb_rec_data, data) + key.dsize + data.dsize;
+ d = (struct ctdb_rec_data *)talloc_size(mem_ctx, length);
+ if (d == NULL) {
+ return NULL;
+ }
+ d->length = length;
+ d->reqid = reqid;
+ d->keylen = key.dsize;
+ d->datalen = data.dsize;
+ memcpy(&d->data[0], key.dptr, key.dsize);
+ memcpy(&d->data[key.dsize], data.dptr, data.dsize);
+ return d;
+}
+
+#if HAVE_SCHED_H
+#include <sched.h>
+#endif
+
+/*
+ if possible, make this task real time
+ */
+void ctdb_set_scheduler(struct ctdb_context *ctdb)
+{
+#if HAVE_SCHED_SETSCHEDULER
+ struct sched_param p;
+ if (ctdb->saved_scheduler_param == NULL) {
+ ctdb->saved_scheduler_param = talloc_size(ctdb, sizeof(p));
+ }
+
+ if (sched_getparam(0, (struct sched_param *)ctdb->saved_scheduler_param) == -1) {
+ DEBUG(0,("Unable to get old scheduler params\n"));
+ return;
+ }
+
+ p = *(struct sched_param *)ctdb->saved_scheduler_param;
+ p.sched_priority = 1;
+
+ if (sched_setscheduler(0, SCHED_FIFO, &p) == -1) {
+ DEBUG(0,("Unable to set scheduler to SCHED_FIFO (%s)\n",
+ strerror(errno)));
+ } else {
+ DEBUG(0,("Set scheduler to SCHED_FIFO\n"));
+ }
+#endif
+}
+
+/*
+ restore previous scheduler parameters
+ */
+void ctdb_restore_scheduler(struct ctdb_context *ctdb)
+{
+#if HAVE_SCHED_SETSCHEDULER
+ if (ctdb->saved_scheduler_param == NULL) {
+ ctdb_fatal(ctdb, "No saved scheduler parameters\n");
+ }
+ if (sched_setscheduler(0, SCHED_OTHER, (struct sched_param *)ctdb->saved_scheduler_param) == -1) {
+ ctdb_fatal(ctdb, "Unable to restore old scheduler parameters\n");
+ }
+#endif
+}
+
+void set_nonblocking(int fd)
+{
+ unsigned v;
+ v = fcntl(fd, F_GETFL, 0);
+ fcntl(fd, F_SETFL, v | O_NONBLOCK);
+}
+
+void set_close_on_exec(int fd)
+{
+ unsigned v;
+ v = fcntl(fd, F_GETFD, 0);
+ fcntl(fd, F_SETFD, v | FD_CLOEXEC);
+}
+
+
+/*
+ parse a ip:port pair
+ */
+bool parse_ip_port(const char *s, struct sockaddr_in *ip)
+{
+ const char *p;
+ char *endp = NULL;
+ unsigned port;
+ char buf[16];
+
+ ip->sin_family = AF_INET;
+
+ p = strchr(s, ':');
+ if (p == NULL) {
+ return false;
+ }
+
+ if (p - s > 15) {
+ return false;
+ }
+
+ port = strtoul(p+1, &endp, 10);
+ if (endp == NULL || *endp != 0) {
+ /* trailing garbage */
+ return false;
+ }
+ ip->sin_port = htons(port);
+
+ strlcpy(buf, s, 1+p-s);
+
+ if (inet_aton(buf, &ip->sin_addr) == 0) {
+ return false;
+ }
+
+ return true;
+}
diff --git a/source4/cluster/ctdb/common/system.c b/source4/cluster/ctdb/common/system.c
new file mode 100644
index 0000000000..1e536f5e8a
--- /dev/null
+++ b/source4/cluster/ctdb/common/system.c
@@ -0,0 +1,385 @@
+/*
+ ctdb recovery code
+
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "../include/ctdb_private.h"
+#include "lib/events/events.h"
+#include <net/ethernet.h>
+#include <net/if_arp.h>
+
+
+
+/*
+ send gratuitous arp reply after we have taken over an ip address
+
+ saddr is the address we are trying to claim
+ iface is the interface name we will be using to claim the address
+ */
+int ctdb_sys_send_arp(const struct sockaddr_in *saddr, const char *iface)
+{
+ int s, ret;
+ struct sockaddr sa;
+ struct ether_header *eh;
+ struct arphdr *ah;
+ struct ifreq if_hwaddr;
+ unsigned char buffer[64]; /*minimum eth frame size */
+ char *ptr;
+
+ /* for now, we only handle AF_INET addresses */
+ if (saddr->sin_family != AF_INET) {
+ DEBUG(0,(__location__ " not an ipv4 address (family is %u)\n", saddr->sin_family));
+ return -1;
+ }
+
+ s = socket(AF_INET, SOCK_PACKET, htons(ETHERTYPE_ARP));
+ if (s == -1){
+ DEBUG(0,(__location__ " failed to open raw socket\n"));
+ return -1;
+ }
+
+ /* get the mac address */
+ strcpy(if_hwaddr.ifr_name, iface);
+ ret = ioctl(s, SIOCGIFHWADDR, &if_hwaddr);
+ if ( ret < 0 ) {
+ close(s);
+ DEBUG(0,(__location__ " ioctl failed\n"));
+ return -1;
+ }
+ if (ARPHRD_LOOPBACK == if_hwaddr.ifr_hwaddr.sa_family) {
+ DEBUG(3,("Ignoring loopback arp request\n"));
+ close(s);
+ return 0;
+ }
+ if (if_hwaddr.ifr_hwaddr.sa_family != AF_LOCAL) {
+ close(s);
+ errno = EINVAL;
+ DEBUG(0,(__location__ " not an ethernet address family (0x%x)\n",
+ if_hwaddr.ifr_hwaddr.sa_family));
+ return -1;
+ }
+
+
+ memset(buffer, 0 , 64);
+ eh = (struct ether_header *)buffer;
+ memset(eh->ether_dhost, 0xff, ETH_ALEN);
+ memcpy(eh->ether_shost, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
+ eh->ether_type = htons(ETHERTYPE_ARP);
+
+ ah = (struct arphdr *)&buffer[sizeof(struct ether_header)];
+ ah->ar_hrd = htons(ARPHRD_ETHER);
+ ah->ar_pro = htons(ETH_P_IP);
+ ah->ar_hln = ETH_ALEN;
+ ah->ar_pln = 4;
+
+ /* send a gratious arp */
+ ah->ar_op = htons(ARPOP_REQUEST);
+ ptr = (char *)&ah[1];
+ memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
+ ptr+=ETH_ALEN;
+ memcpy(ptr, &saddr->sin_addr, 4);
+ ptr+=4;
+ memset(ptr, 0, ETH_ALEN);
+ ptr+=ETH_ALEN;
+ memcpy(ptr, &saddr->sin_addr, 4);
+ ptr+=4;
+
+ strncpy(sa.sa_data, iface, sizeof(sa.sa_data));
+ ret = sendto(s, buffer, 64, 0, &sa, sizeof(sa));
+ if (ret < 0 ){
+ close(s);
+ DEBUG(0,(__location__ " failed sendto\n"));
+ return -1;
+ }
+
+ /* send unsolicited arp reply broadcast */
+ ah->ar_op = htons(ARPOP_REPLY);
+ ptr = (char *)&ah[1];
+ memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
+ ptr+=ETH_ALEN;
+ memcpy(ptr, &saddr->sin_addr, 4);
+ ptr+=4;
+ memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
+ ptr+=ETH_ALEN;
+ memcpy(ptr, &saddr->sin_addr, 4);
+ ptr+=4;
+
+ strncpy(sa.sa_data, iface, sizeof(sa.sa_data));
+ ret = sendto(s, buffer, 64, 0, &sa, sizeof(sa));
+ if (ret < 0 ){
+ DEBUG(0,(__location__ " failed sendto\n"));
+ return -1;
+ }
+
+ close(s);
+ return 0;
+}
+
+
+/*
+ uint16 checksum for n bytes
+ */
+static uint32_t uint16_checksum(uint16_t *data, size_t n)
+{
+ uint32_t sum=0;
+ while (n>=2) {
+ sum += (uint32_t)ntohs(*data);
+ data++;
+ n -= 2;
+ }
+ if (n == 1) {
+ sum += (uint32_t)ntohs(*(uint8_t *)data);
+ }
+ return sum;
+}
+
+/*
+ simple TCP checksum - assumes data is multiple of 2 bytes long
+ */
+static uint16_t tcp_checksum(uint16_t *data, size_t n, struct iphdr *ip)
+{
+ uint32_t sum = uint16_checksum(data, n);
+ uint16_t sum2;
+ sum += uint16_checksum((uint16_t *)&ip->saddr, sizeof(ip->saddr));
+ sum += uint16_checksum((uint16_t *)&ip->daddr, sizeof(ip->daddr));
+ sum += ip->protocol + n;
+ sum = (sum & 0xFFFF) + (sum >> 16);
+ sum = (sum & 0xFFFF) + (sum >> 16);
+ sum2 = htons(sum);
+ sum2 = ~sum2;
+ if (sum2 == 0) {
+ return 0xFFFF;
+ }
+ return sum2;
+}
+
+/*
+ Send tcp segment from the specified IP/port to the specified
+ destination IP/port.
+
+ This is used to trigger the receiving host into sending its own ACK,
+ which should trigger early detection of TCP reset by the client
+ after IP takeover
+
+ This can also be used to send RST segments (if rst is true) and also
+ if correct seq and ack numbers are provided.
+ */
+int ctdb_sys_send_tcp(const struct sockaddr_in *dest,
+ const struct sockaddr_in *src,
+ uint32_t seq, uint32_t ack, int rst)
+{
+ int s, ret;
+ uint32_t one = 1;
+ struct {
+ struct iphdr ip;
+ struct tcphdr tcp;
+ } pkt;
+
+ /* for now, we only handle AF_INET addresses */
+ if (src->sin_family != AF_INET || dest->sin_family != AF_INET) {
+ DEBUG(0,(__location__ " not an ipv4 address\n"));
+ return -1;
+ }
+
+ s = socket(AF_INET, SOCK_RAW, htons(IPPROTO_RAW));
+ if (s == -1) {
+ DEBUG(0,(__location__ " failed to open raw socket (%s)\n",
+ strerror(errno)));
+ return -1;
+ }
+
+ ret = setsockopt(s, SOL_IP, IP_HDRINCL, &one, sizeof(one));
+ if (ret != 0) {
+ DEBUG(0,(__location__ " failed to setup IP headers (%s)\n",
+ strerror(errno)));
+ close(s);
+ return -1;
+ }
+
+ ZERO_STRUCT(pkt);
+ pkt.ip.version = 4;
+ pkt.ip.ihl = sizeof(pkt.ip)/4;
+ pkt.ip.tot_len = htons(sizeof(pkt));
+ pkt.ip.ttl = 255;
+ pkt.ip.protocol = IPPROTO_TCP;
+ pkt.ip.saddr = src->sin_addr.s_addr;
+ pkt.ip.daddr = dest->sin_addr.s_addr;
+ pkt.ip.check = 0;
+
+ pkt.tcp.source = src->sin_port;
+ pkt.tcp.dest = dest->sin_port;
+ pkt.tcp.seq = seq;
+ pkt.tcp.ack_seq = ack;
+ pkt.tcp.ack = 1;
+ if (rst) {
+ pkt.tcp.rst = 1;
+ }
+ pkt.tcp.doff = sizeof(pkt.tcp)/4;
+ pkt.tcp.window = htons(1234);
+ pkt.tcp.check = tcp_checksum((uint16_t *)&pkt.tcp, sizeof(pkt.tcp), &pkt.ip);
+
+ ret = sendto(s, &pkt, sizeof(pkt), 0, dest, sizeof(*dest));
+ if (ret != sizeof(pkt)) {
+ DEBUG(0,(__location__ " failed sendto (%s)\n", strerror(errno)));
+ close(s);
+ return -1;
+ }
+
+ close(s);
+ return 0;
+}
+
+
+/*
+ see if we currently have an interface with the given IP
+
+ we try to bind to it, and if that fails then we don't have that IP
+ on an interface
+ */
+bool ctdb_sys_have_ip(const char *ip)
+{
+ struct sockaddr_in sin;
+ int s;
+ int ret;
+
+ sin.sin_port = 0;
+ inet_aton(ip, &sin.sin_addr);
+ sin.sin_family = AF_INET;
+ s = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP);
+ if (s == -1) {
+ return false;
+ }
+ ret = bind(s, (struct sockaddr *)&sin, sizeof(sin));
+ close(s);
+ return ret == 0;
+}
+
+static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
+ struct timeval yt, void *p)
+{
+ uint32_t *timed_out = (uint32_t *)p;
+ (*timed_out) = 1;
+}
+
+/* This function is used to kill (RST) the specified tcp connection.
+
+ This function is not asynchronous and will block until the operation
+ was successful or it timesout.
+ */
+int ctdb_sys_kill_tcp(struct event_context *ev,
+ const struct sockaddr_in *dst,
+ const struct sockaddr_in *src)
+{
+ int s, ret;
+ uint32_t timedout;
+ TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+#define RCVPKTSIZE 100
+ char pkt[RCVPKTSIZE];
+ struct ether_header *eth;
+ struct iphdr *ip;
+ struct tcphdr *tcp;
+
+ /* Open a socket to capture all traffic */
+ s=socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
+ if (s == -1){
+ DEBUG(0,(__location__ " failed to open raw socket\n"));
+ return -1;
+ }
+
+ /* We wait for up to 1 second for the ACK coming back */
+ timedout = 0;
+ event_add_timed(ev, tmp_ctx, timeval_current_ofs(1, 0), ctdb_wait_handler, &timedout);
+
+ /* Send a tickle ack to probe what the real seq/ack numbers are */
+ ctdb_sys_send_tcp(dst, src, 0, 0, 0);
+
+ /* Wait until we either time out or we succeeds in sending the RST */
+ while (timedout==0) {
+ event_loop_once(ev);
+
+ ret = recv(s, pkt, RCVPKTSIZE, MSG_TRUNC);
+ if (ret < sizeof(*eth)+sizeof(*ip)) {
+ continue;
+ }
+
+ /* Ethernet */
+ eth = (struct ether_header *)pkt;
+ /* We only want IP packets */
+ if (ntohs(eth->ether_type) != ETHERTYPE_IP) {
+ continue;
+ }
+
+ /* IP */
+ ip = (struct iphdr *)(eth+1);
+ /* We only want IPv4 packets */
+ if (ip->version != 4) {
+ continue;
+ }
+ /* Dont look at fragments */
+ if ((ntohs(ip->frag_off)&0x1fff) != 0) {
+ continue;
+ }
+ /* we only want TCP */
+ if (ip->protocol != IPPROTO_TCP) {
+ continue;
+ }
+
+ /* We only want packets sent from the guy we tickled */
+ if (ip->saddr != dst->sin_addr.s_addr) {
+ continue;
+ }
+ /* We only want packets sent to us */
+ if (ip->daddr != src->sin_addr.s_addr) {
+ continue;
+ }
+
+ /* make sure its not a short packet */
+ if (offsetof(struct tcphdr, ack_seq) + 4 +
+ (ip->ihl*4) + sizeof(*eth) > ret) {
+ continue;
+ }
+
+ /* TCP */
+ tcp = (struct tcphdr *)((ip->ihl*4) + (char *)ip);
+
+ /* We only want replies from the port we tickled */
+ if (tcp->source != dst->sin_port) {
+ continue;
+ }
+ if (tcp->dest != src->sin_port) {
+ continue;
+ }
+
+ ctdb_sys_send_tcp(dst, src, tcp->ack_seq, tcp->seq, 1);
+
+ close(s);
+ talloc_free(tmp_ctx);
+
+ return 0;
+ }
+
+ close(s);
+ talloc_free(tmp_ctx);
+ DEBUG(0,(__location__ " timedout waiting for tickle ack reply\n"));
+
+ return -1;
+}