diff options
Diffstat (limited to 'source4/cluster/ctdb/common')
-rw-r--r-- | source4/cluster/ctdb/common/cmdline.c | 132 | ||||
-rw-r--r-- | source4/cluster/ctdb/common/ctdb_io.c | 338 | ||||
-rw-r--r-- | source4/cluster/ctdb/common/ctdb_ltdb.c | 178 | ||||
-rw-r--r-- | source4/cluster/ctdb/common/ctdb_message.c | 112 | ||||
-rw-r--r-- | source4/cluster/ctdb/common/ctdb_util.c | 285 | ||||
-rw-r--r-- | source4/cluster/ctdb/common/system.c | 385 |
6 files changed, 1430 insertions, 0 deletions
diff --git a/source4/cluster/ctdb/common/cmdline.c b/source4/cluster/ctdb/common/cmdline.c new file mode 100644 index 0000000000..df01110e8a --- /dev/null +++ b/source4/cluster/ctdb/common/cmdline.c @@ -0,0 +1,132 @@ +/* + common commandline code to ctdb test tools + + Copyright (C) Andrew Tridgell 2007 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. +*/ + +#include "includes.h" +#include "lib/events/events.h" +#include "system/filesys.h" +#include "popt.h" +#include "../include/ctdb.h" +#include "../include/ctdb_private.h" + +/* Handle common command line options for ctdb test progs + */ + +static struct { + const char *socketname; + int torture; + const char *events; +} ctdb_cmdline = { + .socketname = CTDB_PATH, + .torture = 0, +}; + +enum {OPT_EVENTSYSTEM=1}; + +static void ctdb_cmdline_callback(poptContext con, + enum poptCallbackReason reason, + const struct poptOption *opt, + const char *arg, const void *data) +{ + switch (opt->val) { + case OPT_EVENTSYSTEM: + event_set_default_backend(arg); + break; + } +} + + +struct poptOption popt_ctdb_cmdline[] = { + { NULL, 0, POPT_ARG_CALLBACK, (void *)ctdb_cmdline_callback }, + { "socket", 0, POPT_ARG_STRING, &ctdb_cmdline.socketname, 0, "local socket name", "filename" }, + { "debug", 'd', POPT_ARG_INT, &LogLevel, 0, "debug level"}, + { "torture", 0, POPT_ARG_NONE, &ctdb_cmdline.torture, 0, "enable nastiness in library", NULL }, + { "events", 0, POPT_ARG_STRING, NULL, OPT_EVENTSYSTEM, "event system", NULL }, + { NULL } +}; + + +/* + startup daemon side of ctdb according to command line options + */ +struct ctdb_context *ctdb_cmdline_init(struct event_context *ev) +{ + struct ctdb_context *ctdb; + int ret; + + /* initialise ctdb */ + ctdb = ctdb_init(ev); + if (ctdb == NULL) { + printf("Failed to init ctdb\n"); + exit(1); + } + + if (ctdb_cmdline.torture) { + ctdb_set_flags(ctdb, CTDB_FLAG_TORTURE); + } + + /* tell ctdb the socket address */ + ret = ctdb_set_socketname(ctdb, ctdb_cmdline.socketname); + if (ret == -1) { + printf("ctdb_set_socketname failed - %s\n", ctdb_errstr(ctdb)); + exit(1); + } + + return ctdb; +} + + +/* + startup a client only ctdb context + */ +struct ctdb_context *ctdb_cmdline_client(struct event_context *ev) +{ + struct ctdb_context *ctdb; + int ret; + + /* initialise ctdb */ + ctdb = ctdb_init(ev); + if (ctdb == NULL) { + fprintf(stderr, "Failed to init ctdb\n"); + exit(1); + } + + /* tell ctdb the socket address */ + ret = ctdb_set_socketname(ctdb, ctdb_cmdline.socketname); + if (ret == -1) { + fprintf(stderr, "ctdb_set_socketname failed - %s\n", ctdb_errstr(ctdb)); + exit(1); + } + + ret = ctdb_socket_connect(ctdb); + if (ret != 0) { + fprintf(stderr, __location__ " Failed to connect to daemon\n"); + talloc_free(ctdb); + return NULL; + } + + /* get our vnn */ + ctdb->vnn = ctdb_ctrl_getvnn(ctdb, timeval_zero(), CTDB_CURRENT_NODE); + if (ctdb->vnn == (uint32_t)-1) { + DEBUG(0,(__location__ " Failed to get ctdb vnn\n")); + talloc_free(ctdb); + return NULL; + } + + return ctdb; +} diff --git a/source4/cluster/ctdb/common/ctdb_io.c b/source4/cluster/ctdb/common/ctdb_io.c new file mode 100644 index 0000000000..ca9c635878 --- /dev/null +++ b/source4/cluster/ctdb/common/ctdb_io.c @@ -0,0 +1,338 @@ +/* + ctdb database library + Utility functions to read/write blobs of data from a file descriptor + and handle the case where we might need multiple read/writes to get all the + data. + + Copyright (C) Andrew Tridgell 2006 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. +*/ + +#include "includes.h" +#include "lib/tdb/include/tdb.h" +#include "lib/events/events.h" +#include "lib/util/dlinklist.h" +#include "system/network.h" +#include "system/filesys.h" +#include "../include/ctdb_private.h" +#include "../include/ctdb.h" + +/* structures for packet queueing - see common/ctdb_io.c */ +struct ctdb_partial { + uint8_t *data; + uint32_t length; +}; + +struct ctdb_queue_pkt { + struct ctdb_queue_pkt *next, *prev; + uint8_t *data; + uint32_t length; + uint32_t full_length; +}; + +struct ctdb_queue { + struct ctdb_context *ctdb; + struct ctdb_partial partial; /* partial input packet */ + struct ctdb_queue_pkt *out_queue; + struct fd_event *fde; + int fd; + size_t alignment; + void *private_data; + ctdb_queue_cb_fn_t callback; +}; + + + +/* + called when an incoming connection is readable +*/ +static void queue_io_read(struct ctdb_queue *queue) +{ + int num_ready = 0; + ssize_t nread; + uint8_t *data, *data_base; + + if (ioctl(queue->fd, FIONREAD, &num_ready) != 0) { + return; + } + if (num_ready == 0) { + /* the descriptor has been closed */ + goto failed; + } + + + queue->partial.data = talloc_realloc(queue, queue->partial.data, + uint8_t, + num_ready + queue->partial.length); + + if (queue->partial.data == NULL) { + DEBUG(0,("read error alloc failed for %u\n", + num_ready + queue->partial.length)); + goto failed; + } + + nread = read(queue->fd, queue->partial.data + queue->partial.length, num_ready); + if (nread <= 0) { + DEBUG(0,("read error nread=%d\n", (int)nread)); + goto failed; + } + + + data = queue->partial.data; + nread += queue->partial.length; + + queue->partial.data = NULL; + queue->partial.length = 0; + + if (nread >= 4 && *(uint32_t *)data == nread) { + /* it is the responsibility of the incoming packet + function to free 'data' */ + queue->callback(data, nread, queue->private_data); + return; + } + + data_base = data; + + while (nread >= 4 && *(uint32_t *)data <= nread) { + /* we have at least one packet */ + uint8_t *d2; + uint32_t len; + len = *(uint32_t *)data; + if (len == 0) { + /* bad packet! treat as EOF */ + DEBUG(0,("Invalid packet of length 0\n")); + goto failed; + } + d2 = (uint8_t *)talloc_memdup(queue, data, len); + if (d2 == NULL) { + DEBUG(0,("read error memdup failed for %u\n", len)); + /* sigh */ + goto failed; + } + queue->callback(d2, len, queue->private_data); + data += len; + nread -= len; + } + + if (nread > 0) { + /* we have only part of a packet */ + if (data_base == data) { + queue->partial.data = data; + queue->partial.length = nread; + } else { + queue->partial.data = (uint8_t *)talloc_memdup(queue, data, nread); + if (queue->partial.data == NULL) { + DEBUG(0,("read error memdup partial failed for %u\n", + (unsigned)nread)); + goto failed; + } + queue->partial.length = nread; + talloc_free(data_base); + } + return; + } + + talloc_free(data_base); + return; + +failed: + queue->callback(NULL, 0, queue->private_data); +} + + +/* used when an event triggers a dead queue */ +static void queue_dead(struct event_context *ev, struct timed_event *te, + struct timeval t, void *private_data) +{ + struct ctdb_queue *queue = talloc_get_type(private_data, struct ctdb_queue); + queue->callback(NULL, 0, queue->private_data); +} + + +/* + called when an incoming connection is writeable +*/ +static void queue_io_write(struct ctdb_queue *queue) +{ + while (queue->out_queue) { + struct ctdb_queue_pkt *pkt = queue->out_queue; + ssize_t n; + if (queue->ctdb->flags & CTDB_FLAG_TORTURE) { + n = write(queue->fd, pkt->data, 1); + } else { + n = write(queue->fd, pkt->data, pkt->length); + } + + if (n == -1 && errno != EAGAIN && errno != EWOULDBLOCK) { + if (pkt->length != pkt->full_length) { + /* partial packet sent - we have to drop it */ + DLIST_REMOVE(queue->out_queue, pkt); + talloc_free(pkt); + } + talloc_free(queue->fde); + queue->fde = NULL; + queue->fd = -1; + event_add_timed(queue->ctdb->ev, queue, timeval_zero(), + queue_dead, queue); + return; + } + if (n <= 0) return; + + if (n != pkt->length) { + pkt->length -= n; + pkt->data += n; + return; + } + + DLIST_REMOVE(queue->out_queue, pkt); + talloc_free(pkt); + } + + EVENT_FD_NOT_WRITEABLE(queue->fde); +} + +/* + called when an incoming connection is readable or writeable +*/ +static void queue_io_handler(struct event_context *ev, struct fd_event *fde, + uint16_t flags, void *private_data) +{ + struct ctdb_queue *queue = talloc_get_type(private_data, struct ctdb_queue); + + if (flags & EVENT_FD_READ) { + queue_io_read(queue); + } else { + queue_io_write(queue); + } +} + + +/* + queue a packet for sending +*/ +int ctdb_queue_send(struct ctdb_queue *queue, uint8_t *data, uint32_t length) +{ + struct ctdb_queue_pkt *pkt; + uint32_t length2, full_length; + + if (queue->alignment) { + /* enforce the length and alignment rules from the tcp packet allocator */ + length2 = (length+(queue->alignment-1)) & ~(queue->alignment-1); + *(uint32_t *)data = length2; + } else { + length2 = length; + } + + if (length2 != length) { + memset(data+length, 0, length2-length); + } + + full_length = length2; + + /* if the queue is empty then try an immediate write, avoiding + queue overhead. This relies on non-blocking sockets */ + if (queue->out_queue == NULL && queue->fd != -1 && + !(queue->ctdb->flags & CTDB_FLAG_TORTURE)) { + ssize_t n = write(queue->fd, data, length2); + if (n == -1 && errno != EAGAIN && errno != EWOULDBLOCK) { + talloc_free(queue->fde); + queue->fde = NULL; + queue->fd = -1; + event_add_timed(queue->ctdb->ev, queue, timeval_zero(), + queue_dead, queue); + /* yes, we report success, as the dead node is + handled via a separate event */ + return 0; + } + if (n > 0) { + data += n; + length2 -= n; + } + if (length2 == 0) return 0; + } + + pkt = talloc(queue, struct ctdb_queue_pkt); + CTDB_NO_MEMORY(queue->ctdb, pkt); + + pkt->data = (uint8_t *)talloc_memdup(pkt, data, length2); + CTDB_NO_MEMORY(queue->ctdb, pkt->data); + + pkt->length = length2; + pkt->full_length = full_length; + + if (queue->out_queue == NULL && queue->fd != -1) { + EVENT_FD_WRITEABLE(queue->fde); + } + + DLIST_ADD_END(queue->out_queue, pkt, struct ctdb_queue_pkt *); + + return 0; +} + + +/* + setup the fd used by the queue + */ +int ctdb_queue_set_fd(struct ctdb_queue *queue, int fd) +{ + queue->fd = fd; + talloc_free(queue->fde); + queue->fde = NULL; + + if (fd != -1) { + queue->fde = event_add_fd(queue->ctdb->ev, queue, fd, EVENT_FD_READ|EVENT_FD_AUTOCLOSE, + queue_io_handler, queue); + if (queue->fde == NULL) { + return -1; + } + + if (queue->out_queue) { + EVENT_FD_WRITEABLE(queue->fde); + } + } + + return 0; +} + + + +/* + setup a packet queue on a socket + */ +struct ctdb_queue *ctdb_queue_setup(struct ctdb_context *ctdb, + TALLOC_CTX *mem_ctx, int fd, int alignment, + + ctdb_queue_cb_fn_t callback, + void *private_data) +{ + struct ctdb_queue *queue; + + queue = talloc_zero(mem_ctx, struct ctdb_queue); + CTDB_NO_MEMORY_NULL(ctdb, queue); + + queue->ctdb = ctdb; + queue->fd = fd; + queue->alignment = alignment; + queue->private_data = private_data; + queue->callback = callback; + if (fd != -1) { + if (ctdb_queue_set_fd(queue, fd) != 0) { + talloc_free(queue); + return NULL; + } + } + + return queue; +} diff --git a/source4/cluster/ctdb/common/ctdb_ltdb.c b/source4/cluster/ctdb/common/ctdb_ltdb.c new file mode 100644 index 0000000000..92adc4a12a --- /dev/null +++ b/source4/cluster/ctdb/common/ctdb_ltdb.c @@ -0,0 +1,178 @@ +/* + ctdb ltdb code + + Copyright (C) Andrew Tridgell 2006 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. +*/ + +#include "includes.h" +#include "lib/events/events.h" +#include "lib/tdb/include/tdb.h" +#include "system/network.h" +#include "system/filesys.h" +#include "../include/ctdb_private.h" +#include "tdb_wrap.h" +#include "lib/util/dlinklist.h" + +/* + find an attached ctdb_db handle given a name + */ +struct ctdb_db_context *ctdb_db_handle(struct ctdb_context *ctdb, const char *name) +{ + struct ctdb_db_context *tmp_db; + for (tmp_db=ctdb->db_list;tmp_db;tmp_db=tmp_db->next) { + if (strcmp(name, tmp_db->db_name) == 0) { + return tmp_db; + } + } + return NULL; +} + + +/* + return the lmaster given a key +*/ +uint32_t ctdb_lmaster(struct ctdb_context *ctdb, const TDB_DATA *key) +{ + uint32_t idx, lmaster; + + idx = ctdb_hash(key) % ctdb->vnn_map->size; + lmaster = ctdb->vnn_map->map[idx]; + + return lmaster; +} + + +/* + construct an initial header for a record with no ltdb header yet +*/ +static void ltdb_initial_header(struct ctdb_db_context *ctdb_db, + TDB_DATA key, + struct ctdb_ltdb_header *header) +{ + header->rsn = 0; + /* initial dmaster is the lmaster */ + header->dmaster = ctdb_lmaster(ctdb_db->ctdb, &key); + header->laccessor = header->dmaster; + header->lacount = 0; +} + + +/* + fetch a record from the ltdb, separating out the header information + and returning the body of the record. A valid (initial) header is + returned if the record is not present +*/ +int ctdb_ltdb_fetch(struct ctdb_db_context *ctdb_db, + TDB_DATA key, struct ctdb_ltdb_header *header, + TALLOC_CTX *mem_ctx, TDB_DATA *data) +{ + TDB_DATA rec; + struct ctdb_context *ctdb = ctdb_db->ctdb; + + rec = tdb_fetch(ctdb_db->ltdb->tdb, key); + if (rec.dsize < sizeof(*header)) { + TDB_DATA d2; + /* return an initial header */ + if (rec.dptr) free(rec.dptr); + if (ctdb->vnn_map == NULL) { + /* called from the client */ + ZERO_STRUCTP(data); + header->dmaster = (uint32_t)-1; + return -1; + } + ltdb_initial_header(ctdb_db, key, header); + ZERO_STRUCT(d2); + if (data) { + *data = d2; + } + ctdb_ltdb_store(ctdb_db, key, header, d2); + return 0; + } + + *header = *(struct ctdb_ltdb_header *)rec.dptr; + + if (data) { + data->dsize = rec.dsize - sizeof(struct ctdb_ltdb_header); + data->dptr = (unsigned char *)talloc_memdup(mem_ctx, + sizeof(struct ctdb_ltdb_header)+rec.dptr, + data->dsize); + } + + free(rec.dptr); + if (data) { + CTDB_NO_MEMORY(ctdb, data->dptr); + } + + return 0; +} + + +/* + fetch a record from the ltdb, separating out the header information + and returning the body of the record. A valid (initial) header is + returned if the record is not present +*/ +int ctdb_ltdb_store(struct ctdb_db_context *ctdb_db, TDB_DATA key, + struct ctdb_ltdb_header *header, TDB_DATA data) +{ + struct ctdb_context *ctdb = ctdb_db->ctdb; + TDB_DATA rec; + int ret; + + if (ctdb->flags & CTDB_FLAG_TORTURE) { + struct ctdb_ltdb_header *h2; + rec = tdb_fetch(ctdb_db->ltdb->tdb, key); + h2 = (struct ctdb_ltdb_header *)rec.dptr; + if (rec.dptr && rec.dsize >= sizeof(h2) && h2->rsn > header->rsn) { + DEBUG(0,("RSN regression! %llu %llu\n", + (unsigned long long)h2->rsn, (unsigned long long)header->rsn)); + } + if (rec.dptr) free(rec.dptr); + } + + rec.dsize = sizeof(*header) + data.dsize; + rec.dptr = (unsigned char *)talloc_size(ctdb, rec.dsize); + CTDB_NO_MEMORY(ctdb, rec.dptr); + + memcpy(rec.dptr, header, sizeof(*header)); + memcpy(rec.dptr + sizeof(*header), data.dptr, data.dsize); + + ret = tdb_store(ctdb_db->ltdb->tdb, key, rec, TDB_REPLACE); + talloc_free(rec.dptr); + + return ret; +} + + +/* + lock a record in the ltdb, given a key + */ +int ctdb_ltdb_lock(struct ctdb_db_context *ctdb_db, TDB_DATA key) +{ + return tdb_chainlock(ctdb_db->ltdb->tdb, key); +} + +/* + unlock a record in the ltdb, given a key + */ +int ctdb_ltdb_unlock(struct ctdb_db_context *ctdb_db, TDB_DATA key) +{ + int ret = tdb_chainunlock(ctdb_db->ltdb->tdb, key); + if (ret != 0) { + DEBUG(0,("tdb_chainunlock failed\n")); + } + return ret; +} diff --git a/source4/cluster/ctdb/common/ctdb_message.c b/source4/cluster/ctdb/common/ctdb_message.c new file mode 100644 index 0000000000..1aea28fd35 --- /dev/null +++ b/source4/cluster/ctdb/common/ctdb_message.c @@ -0,0 +1,112 @@ +/* + ctdb_message protocol code + + Copyright (C) Andrew Tridgell 2007 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. +*/ +/* + see http://wiki.samba.org/index.php/Samba_%26_Clustering for + protocol design and packet details +*/ +#include "includes.h" +#include "lib/events/events.h" +#include "lib/tdb/include/tdb.h" +#include "system/network.h" +#include "system/filesys.h" +#include "../include/ctdb_private.h" +#include "lib/util/dlinklist.h" + +/* + this dispatches the messages to the registered ctdb message handler +*/ +int ctdb_dispatch_message(struct ctdb_context *ctdb, uint64_t srvid, TDB_DATA data) +{ + struct ctdb_message_list *ml; + + for (ml=ctdb->message_list;ml;ml=ml->next) { + if (ml->srvid == srvid || ml->srvid == CTDB_SRVID_ALL) { + ml->message_handler(ctdb, srvid, data, ml->message_private); + } + } + + return 0; +} + +/* + called when a CTDB_REQ_MESSAGE packet comes in +*/ +void ctdb_request_message(struct ctdb_context *ctdb, struct ctdb_req_header *hdr) +{ + struct ctdb_req_message *c = (struct ctdb_req_message *)hdr; + TDB_DATA data; + + data.dptr = &c->data[0]; + data.dsize = c->datalen; + + ctdb_dispatch_message(ctdb, c->srvid, data); +} + + +/* + when a client goes away, we need to remove its srvid handler from the list + */ +static int message_handler_destructor(struct ctdb_message_list *m) +{ + DLIST_REMOVE(m->ctdb->message_list, m); + return 0; +} + +/* + setup handler for receipt of ctdb messages from ctdb_send_message() +*/ +int ctdb_register_message_handler(struct ctdb_context *ctdb, + TALLOC_CTX *mem_ctx, + uint64_t srvid, + ctdb_message_fn_t handler, + void *private_data) +{ + struct ctdb_message_list *m; + + m = talloc(mem_ctx, struct ctdb_message_list); + CTDB_NO_MEMORY(ctdb, m); + + m->ctdb = ctdb; + m->srvid = srvid; + m->message_handler = handler; + m->message_private = private_data; + + DLIST_ADD(ctdb->message_list, m); + + talloc_set_destructor(m, message_handler_destructor); + + return 0; +} + + +/* + setup handler for receipt of ctdb messages from ctdb_send_message() +*/ +int ctdb_deregister_message_handler(struct ctdb_context *ctdb, uint64_t srvid, void *private_data) +{ + struct ctdb_message_list *m; + + for (m=ctdb->message_list;m;m=m->next) { + if (m->srvid == srvid && m->message_private == private_data) { + talloc_free(m); + return 0; + } + } + return -1; +} diff --git a/source4/cluster/ctdb/common/ctdb_util.c b/source4/cluster/ctdb/common/ctdb_util.c new file mode 100644 index 0000000000..f11388331d --- /dev/null +++ b/source4/cluster/ctdb/common/ctdb_util.c @@ -0,0 +1,285 @@ +/* + ctdb utility code + + Copyright (C) Andrew Tridgell 2006 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. +*/ + +#include "includes.h" +#include "lib/events/events.h" +#include "lib/tdb/include/tdb.h" +#include "system/network.h" +#include "system/filesys.h" +#include "../include/ctdb_private.h" + +int LogLevel; + +/* + return error string for last error +*/ +const char *ctdb_errstr(struct ctdb_context *ctdb) +{ + return ctdb->err_msg; +} + + +/* + remember an error message +*/ +void ctdb_set_error(struct ctdb_context *ctdb, const char *fmt, ...) +{ + va_list ap; + talloc_free(ctdb->err_msg); + va_start(ap, fmt); + ctdb->err_msg = talloc_vasprintf(ctdb, fmt, ap); + DEBUG(0,("ctdb error: %s\n", ctdb->err_msg)); + va_end(ap); +} + +/* + a fatal internal error occurred - no hope for recovery +*/ +_NORETURN_ void ctdb_fatal(struct ctdb_context *ctdb, const char *msg) +{ + DEBUG(0,("ctdb fatal error: %s\n", msg)); + abort(); +} + +/* + parse a IP:port pair +*/ +int ctdb_parse_address(struct ctdb_context *ctdb, + TALLOC_CTX *mem_ctx, const char *str, + struct ctdb_address *address) +{ + struct servent *se; + + setservent(0); + se = getservbyname("ctdb", "tcp"); + endservent(); + + address->address = talloc_strdup(mem_ctx, str); + if (se == NULL) { + address->port = CTDB_PORT; + } else { + address->port = ntohs(se->s_port); + } + return 0; +} + + +/* + check if two addresses are the same +*/ +bool ctdb_same_address(struct ctdb_address *a1, struct ctdb_address *a2) +{ + return strcmp(a1->address, a2->address) == 0 && a1->port == a2->port; +} + + +/* + hash function for mapping data to a VNN - taken from tdb +*/ +uint32_t ctdb_hash(const TDB_DATA *key) +{ + uint32_t value; /* Used to compute the hash value. */ + uint32_t i; /* Used to cycle through random values. */ + + /* Set the initial value from the key size. */ + for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++) + value = (value + (key->dptr[i] << (i*5 % 24))); + + return (1103515243 * value + 12345); +} + +/* + a type checking varient of idr_find + */ +static void *_idr_find_type(struct idr_context *idp, int id, const char *type, const char *location) +{ + void *p = idr_find(idp, id); + if (p && talloc_check_name(p, type) == NULL) { + DEBUG(0,("%s idr_find_type expected type %s but got %s\n", + location, type, talloc_get_name(p))); + return NULL; + } + return p; +} + + +/* + update a max latency number + */ +void ctdb_latency(double *latency, struct timeval t) +{ + double l = timeval_elapsed(&t); + if (l > *latency) { + *latency = l; + } +} + +uint32_t ctdb_reqid_new(struct ctdb_context *ctdb, void *state) +{ + uint32_t id; + + id = ctdb->idr_cnt++ & 0xFFFF; + id |= (idr_get_new(ctdb->idr, state, 0xFFFF)<<16); + return id; +} + +void *_ctdb_reqid_find(struct ctdb_context *ctdb, uint32_t reqid, const char *type, const char *location) +{ + void *p; + + p = _idr_find_type(ctdb->idr, (reqid>>16)&0xFFFF, type, location); + if (p == NULL) { + DEBUG(0, ("Could not find idr:%u\n",reqid)); + } + + return p; +} + + +void ctdb_reqid_remove(struct ctdb_context *ctdb, uint32_t reqid) +{ + int ret; + + ret = idr_remove(ctdb->idr, (reqid>>16)&0xFFFF); + if (ret != 0) { + DEBUG(0, ("Removing idr that does not exist\n")); + } +} + + +/* + form a ctdb_rec_data record from a key/data pair + */ +struct ctdb_rec_data *ctdb_marshall_record(TALLOC_CTX *mem_ctx, uint32_t reqid, TDB_DATA key, TDB_DATA data) +{ + size_t length; + struct ctdb_rec_data *d; + + length = offsetof(struct ctdb_rec_data, data) + key.dsize + data.dsize; + d = (struct ctdb_rec_data *)talloc_size(mem_ctx, length); + if (d == NULL) { + return NULL; + } + d->length = length; + d->reqid = reqid; + d->keylen = key.dsize; + d->datalen = data.dsize; + memcpy(&d->data[0], key.dptr, key.dsize); + memcpy(&d->data[key.dsize], data.dptr, data.dsize); + return d; +} + +#if HAVE_SCHED_H +#include <sched.h> +#endif + +/* + if possible, make this task real time + */ +void ctdb_set_scheduler(struct ctdb_context *ctdb) +{ +#if HAVE_SCHED_SETSCHEDULER + struct sched_param p; + if (ctdb->saved_scheduler_param == NULL) { + ctdb->saved_scheduler_param = talloc_size(ctdb, sizeof(p)); + } + + if (sched_getparam(0, (struct sched_param *)ctdb->saved_scheduler_param) == -1) { + DEBUG(0,("Unable to get old scheduler params\n")); + return; + } + + p = *(struct sched_param *)ctdb->saved_scheduler_param; + p.sched_priority = 1; + + if (sched_setscheduler(0, SCHED_FIFO, &p) == -1) { + DEBUG(0,("Unable to set scheduler to SCHED_FIFO (%s)\n", + strerror(errno))); + } else { + DEBUG(0,("Set scheduler to SCHED_FIFO\n")); + } +#endif +} + +/* + restore previous scheduler parameters + */ +void ctdb_restore_scheduler(struct ctdb_context *ctdb) +{ +#if HAVE_SCHED_SETSCHEDULER + if (ctdb->saved_scheduler_param == NULL) { + ctdb_fatal(ctdb, "No saved scheduler parameters\n"); + } + if (sched_setscheduler(0, SCHED_OTHER, (struct sched_param *)ctdb->saved_scheduler_param) == -1) { + ctdb_fatal(ctdb, "Unable to restore old scheduler parameters\n"); + } +#endif +} + +void set_nonblocking(int fd) +{ + unsigned v; + v = fcntl(fd, F_GETFL, 0); + fcntl(fd, F_SETFL, v | O_NONBLOCK); +} + +void set_close_on_exec(int fd) +{ + unsigned v; + v = fcntl(fd, F_GETFD, 0); + fcntl(fd, F_SETFD, v | FD_CLOEXEC); +} + + +/* + parse a ip:port pair + */ +bool parse_ip_port(const char *s, struct sockaddr_in *ip) +{ + const char *p; + char *endp = NULL; + unsigned port; + char buf[16]; + + ip->sin_family = AF_INET; + + p = strchr(s, ':'); + if (p == NULL) { + return false; + } + + if (p - s > 15) { + return false; + } + + port = strtoul(p+1, &endp, 10); + if (endp == NULL || *endp != 0) { + /* trailing garbage */ + return false; + } + ip->sin_port = htons(port); + + strlcpy(buf, s, 1+p-s); + + if (inet_aton(buf, &ip->sin_addr) == 0) { + return false; + } + + return true; +} diff --git a/source4/cluster/ctdb/common/system.c b/source4/cluster/ctdb/common/system.c new file mode 100644 index 0000000000..1e536f5e8a --- /dev/null +++ b/source4/cluster/ctdb/common/system.c @@ -0,0 +1,385 @@ +/* + ctdb recovery code + + Copyright (C) Ronnie Sahlberg 2007 + Copyright (C) Andrew Tridgell 2007 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. +*/ + +#include "includes.h" +#include "system/network.h" +#include "system/filesys.h" +#include "system/wait.h" +#include "../include/ctdb_private.h" +#include "lib/events/events.h" +#include <net/ethernet.h> +#include <net/if_arp.h> + + + +/* + send gratuitous arp reply after we have taken over an ip address + + saddr is the address we are trying to claim + iface is the interface name we will be using to claim the address + */ +int ctdb_sys_send_arp(const struct sockaddr_in *saddr, const char *iface) +{ + int s, ret; + struct sockaddr sa; + struct ether_header *eh; + struct arphdr *ah; + struct ifreq if_hwaddr; + unsigned char buffer[64]; /*minimum eth frame size */ + char *ptr; + + /* for now, we only handle AF_INET addresses */ + if (saddr->sin_family != AF_INET) { + DEBUG(0,(__location__ " not an ipv4 address (family is %u)\n", saddr->sin_family)); + return -1; + } + + s = socket(AF_INET, SOCK_PACKET, htons(ETHERTYPE_ARP)); + if (s == -1){ + DEBUG(0,(__location__ " failed to open raw socket\n")); + return -1; + } + + /* get the mac address */ + strcpy(if_hwaddr.ifr_name, iface); + ret = ioctl(s, SIOCGIFHWADDR, &if_hwaddr); + if ( ret < 0 ) { + close(s); + DEBUG(0,(__location__ " ioctl failed\n")); + return -1; + } + if (ARPHRD_LOOPBACK == if_hwaddr.ifr_hwaddr.sa_family) { + DEBUG(3,("Ignoring loopback arp request\n")); + close(s); + return 0; + } + if (if_hwaddr.ifr_hwaddr.sa_family != AF_LOCAL) { + close(s); + errno = EINVAL; + DEBUG(0,(__location__ " not an ethernet address family (0x%x)\n", + if_hwaddr.ifr_hwaddr.sa_family)); + return -1; + } + + + memset(buffer, 0 , 64); + eh = (struct ether_header *)buffer; + memset(eh->ether_dhost, 0xff, ETH_ALEN); + memcpy(eh->ether_shost, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN); + eh->ether_type = htons(ETHERTYPE_ARP); + + ah = (struct arphdr *)&buffer[sizeof(struct ether_header)]; + ah->ar_hrd = htons(ARPHRD_ETHER); + ah->ar_pro = htons(ETH_P_IP); + ah->ar_hln = ETH_ALEN; + ah->ar_pln = 4; + + /* send a gratious arp */ + ah->ar_op = htons(ARPOP_REQUEST); + ptr = (char *)&ah[1]; + memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN); + ptr+=ETH_ALEN; + memcpy(ptr, &saddr->sin_addr, 4); + ptr+=4; + memset(ptr, 0, ETH_ALEN); + ptr+=ETH_ALEN; + memcpy(ptr, &saddr->sin_addr, 4); + ptr+=4; + + strncpy(sa.sa_data, iface, sizeof(sa.sa_data)); + ret = sendto(s, buffer, 64, 0, &sa, sizeof(sa)); + if (ret < 0 ){ + close(s); + DEBUG(0,(__location__ " failed sendto\n")); + return -1; + } + + /* send unsolicited arp reply broadcast */ + ah->ar_op = htons(ARPOP_REPLY); + ptr = (char *)&ah[1]; + memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN); + ptr+=ETH_ALEN; + memcpy(ptr, &saddr->sin_addr, 4); + ptr+=4; + memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN); + ptr+=ETH_ALEN; + memcpy(ptr, &saddr->sin_addr, 4); + ptr+=4; + + strncpy(sa.sa_data, iface, sizeof(sa.sa_data)); + ret = sendto(s, buffer, 64, 0, &sa, sizeof(sa)); + if (ret < 0 ){ + DEBUG(0,(__location__ " failed sendto\n")); + return -1; + } + + close(s); + return 0; +} + + +/* + uint16 checksum for n bytes + */ +static uint32_t uint16_checksum(uint16_t *data, size_t n) +{ + uint32_t sum=0; + while (n>=2) { + sum += (uint32_t)ntohs(*data); + data++; + n -= 2; + } + if (n == 1) { + sum += (uint32_t)ntohs(*(uint8_t *)data); + } + return sum; +} + +/* + simple TCP checksum - assumes data is multiple of 2 bytes long + */ +static uint16_t tcp_checksum(uint16_t *data, size_t n, struct iphdr *ip) +{ + uint32_t sum = uint16_checksum(data, n); + uint16_t sum2; + sum += uint16_checksum((uint16_t *)&ip->saddr, sizeof(ip->saddr)); + sum += uint16_checksum((uint16_t *)&ip->daddr, sizeof(ip->daddr)); + sum += ip->protocol + n; + sum = (sum & 0xFFFF) + (sum >> 16); + sum = (sum & 0xFFFF) + (sum >> 16); + sum2 = htons(sum); + sum2 = ~sum2; + if (sum2 == 0) { + return 0xFFFF; + } + return sum2; +} + +/* + Send tcp segment from the specified IP/port to the specified + destination IP/port. + + This is used to trigger the receiving host into sending its own ACK, + which should trigger early detection of TCP reset by the client + after IP takeover + + This can also be used to send RST segments (if rst is true) and also + if correct seq and ack numbers are provided. + */ +int ctdb_sys_send_tcp(const struct sockaddr_in *dest, + const struct sockaddr_in *src, + uint32_t seq, uint32_t ack, int rst) +{ + int s, ret; + uint32_t one = 1; + struct { + struct iphdr ip; + struct tcphdr tcp; + } pkt; + + /* for now, we only handle AF_INET addresses */ + if (src->sin_family != AF_INET || dest->sin_family != AF_INET) { + DEBUG(0,(__location__ " not an ipv4 address\n")); + return -1; + } + + s = socket(AF_INET, SOCK_RAW, htons(IPPROTO_RAW)); + if (s == -1) { + DEBUG(0,(__location__ " failed to open raw socket (%s)\n", + strerror(errno))); + return -1; + } + + ret = setsockopt(s, SOL_IP, IP_HDRINCL, &one, sizeof(one)); + if (ret != 0) { + DEBUG(0,(__location__ " failed to setup IP headers (%s)\n", + strerror(errno))); + close(s); + return -1; + } + + ZERO_STRUCT(pkt); + pkt.ip.version = 4; + pkt.ip.ihl = sizeof(pkt.ip)/4; + pkt.ip.tot_len = htons(sizeof(pkt)); + pkt.ip.ttl = 255; + pkt.ip.protocol = IPPROTO_TCP; + pkt.ip.saddr = src->sin_addr.s_addr; + pkt.ip.daddr = dest->sin_addr.s_addr; + pkt.ip.check = 0; + + pkt.tcp.source = src->sin_port; + pkt.tcp.dest = dest->sin_port; + pkt.tcp.seq = seq; + pkt.tcp.ack_seq = ack; + pkt.tcp.ack = 1; + if (rst) { + pkt.tcp.rst = 1; + } + pkt.tcp.doff = sizeof(pkt.tcp)/4; + pkt.tcp.window = htons(1234); + pkt.tcp.check = tcp_checksum((uint16_t *)&pkt.tcp, sizeof(pkt.tcp), &pkt.ip); + + ret = sendto(s, &pkt, sizeof(pkt), 0, dest, sizeof(*dest)); + if (ret != sizeof(pkt)) { + DEBUG(0,(__location__ " failed sendto (%s)\n", strerror(errno))); + close(s); + return -1; + } + + close(s); + return 0; +} + + +/* + see if we currently have an interface with the given IP + + we try to bind to it, and if that fails then we don't have that IP + on an interface + */ +bool ctdb_sys_have_ip(const char *ip) +{ + struct sockaddr_in sin; + int s; + int ret; + + sin.sin_port = 0; + inet_aton(ip, &sin.sin_addr); + sin.sin_family = AF_INET; + s = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); + if (s == -1) { + return false; + } + ret = bind(s, (struct sockaddr *)&sin, sizeof(sin)); + close(s); + return ret == 0; +} + +static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te, + struct timeval yt, void *p) +{ + uint32_t *timed_out = (uint32_t *)p; + (*timed_out) = 1; +} + +/* This function is used to kill (RST) the specified tcp connection. + + This function is not asynchronous and will block until the operation + was successful or it timesout. + */ +int ctdb_sys_kill_tcp(struct event_context *ev, + const struct sockaddr_in *dst, + const struct sockaddr_in *src) +{ + int s, ret; + uint32_t timedout; + TALLOC_CTX *tmp_ctx = talloc_new(NULL); +#define RCVPKTSIZE 100 + char pkt[RCVPKTSIZE]; + struct ether_header *eth; + struct iphdr *ip; + struct tcphdr *tcp; + + /* Open a socket to capture all traffic */ + s=socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); + if (s == -1){ + DEBUG(0,(__location__ " failed to open raw socket\n")); + return -1; + } + + /* We wait for up to 1 second for the ACK coming back */ + timedout = 0; + event_add_timed(ev, tmp_ctx, timeval_current_ofs(1, 0), ctdb_wait_handler, &timedout); + + /* Send a tickle ack to probe what the real seq/ack numbers are */ + ctdb_sys_send_tcp(dst, src, 0, 0, 0); + + /* Wait until we either time out or we succeeds in sending the RST */ + while (timedout==0) { + event_loop_once(ev); + + ret = recv(s, pkt, RCVPKTSIZE, MSG_TRUNC); + if (ret < sizeof(*eth)+sizeof(*ip)) { + continue; + } + + /* Ethernet */ + eth = (struct ether_header *)pkt; + /* We only want IP packets */ + if (ntohs(eth->ether_type) != ETHERTYPE_IP) { + continue; + } + + /* IP */ + ip = (struct iphdr *)(eth+1); + /* We only want IPv4 packets */ + if (ip->version != 4) { + continue; + } + /* Dont look at fragments */ + if ((ntohs(ip->frag_off)&0x1fff) != 0) { + continue; + } + /* we only want TCP */ + if (ip->protocol != IPPROTO_TCP) { + continue; + } + + /* We only want packets sent from the guy we tickled */ + if (ip->saddr != dst->sin_addr.s_addr) { + continue; + } + /* We only want packets sent to us */ + if (ip->daddr != src->sin_addr.s_addr) { + continue; + } + + /* make sure its not a short packet */ + if (offsetof(struct tcphdr, ack_seq) + 4 + + (ip->ihl*4) + sizeof(*eth) > ret) { + continue; + } + + /* TCP */ + tcp = (struct tcphdr *)((ip->ihl*4) + (char *)ip); + + /* We only want replies from the port we tickled */ + if (tcp->source != dst->sin_port) { + continue; + } + if (tcp->dest != src->sin_port) { + continue; + } + + ctdb_sys_send_tcp(dst, src, tcp->ack_seq, tcp->seq, 1); + + close(s); + talloc_free(tmp_ctx); + + return 0; + } + + close(s); + talloc_free(tmp_ctx); + DEBUG(0,(__location__ " timedout waiting for tickle ack reply\n")); + + return -1; +} |