diff options
Diffstat (limited to 'source4/cluster/ctdb/tcp')
-rw-r--r-- | source4/cluster/ctdb/tcp/ctdb_tcp.h | 54 | ||||
-rw-r--r-- | source4/cluster/ctdb/tcp/tcp_connect.c | 370 | ||||
-rw-r--r-- | source4/cluster/ctdb/tcp/tcp_init.c | 140 | ||||
-rw-r--r-- | source4/cluster/ctdb/tcp/tcp_io.c | 89 |
4 files changed, 653 insertions, 0 deletions
diff --git a/source4/cluster/ctdb/tcp/ctdb_tcp.h b/source4/cluster/ctdb/tcp/ctdb_tcp.h new file mode 100644 index 0000000000..3a1285f47c --- /dev/null +++ b/source4/cluster/ctdb/tcp/ctdb_tcp.h @@ -0,0 +1,54 @@ +/* + ctdb database library + + Copyright (C) Andrew Tridgell 2006 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. +*/ + + +/* ctdb_tcp main state */ +struct ctdb_tcp { + int listen_fd; +}; + +/* + state associated with an incoming connection +*/ +struct ctdb_incoming { + struct ctdb_context *ctdb; + int fd; + struct ctdb_queue *queue; +}; + +/* + state associated with one tcp node +*/ +struct ctdb_tcp_node { + int fd; + struct ctdb_queue *out_queue; + struct fd_event *connect_fde; + struct timed_event *connect_te; +}; + + +/* prototypes internal to tcp transport */ +int ctdb_tcp_queue_pkt(struct ctdb_node *node, uint8_t *data, uint32_t length); +int ctdb_tcp_listen(struct ctdb_context *ctdb); +void ctdb_tcp_node_connect(struct event_context *ev, struct timed_event *te, + struct timeval t, void *private_data); +void ctdb_tcp_read_cb(uint8_t *data, size_t cnt, void *args); +void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data); + +#define CTDB_TCP_ALIGNMENT 8 diff --git a/source4/cluster/ctdb/tcp/tcp_connect.c b/source4/cluster/ctdb/tcp/tcp_connect.c new file mode 100644 index 0000000000..2f828e5717 --- /dev/null +++ b/source4/cluster/ctdb/tcp/tcp_connect.c @@ -0,0 +1,370 @@ +/* + ctdb over TCP + + Copyright (C) Andrew Tridgell 2006 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. +*/ + +#include "includes.h" +#include "lib/events/events.h" +#include "lib/tdb/include/tdb.h" +#include "system/network.h" +#include "system/filesys.h" +#include "../include/ctdb_private.h" +#include "ctdb_tcp.h" + +/* + called when a complete packet has come in - should not happen on this socket + */ +void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data) +{ + struct ctdb_node *node = talloc_get_type(private_data, struct ctdb_node); + struct ctdb_tcp_node *tnode = talloc_get_type( + node->private_data, struct ctdb_tcp_node); + + if (data == NULL) { + node->ctdb->upcalls->node_dead(node); + } + + /* start a new connect cycle to try to re-establish the + link */ + ctdb_queue_set_fd(tnode->out_queue, -1); + tnode->fd = -1; + event_add_timed(node->ctdb->ev, tnode, timeval_zero(), + ctdb_tcp_node_connect, node); +} + +/* + called when socket becomes writeable on connect +*/ +static void ctdb_node_connect_write(struct event_context *ev, struct fd_event *fde, + uint16_t flags, void *private_data) +{ + struct ctdb_node *node = talloc_get_type(private_data, + struct ctdb_node); + struct ctdb_tcp_node *tnode = talloc_get_type(node->private_data, + struct ctdb_tcp_node); + struct ctdb_context *ctdb = node->ctdb; + int error = 0; + socklen_t len = sizeof(error); + int one = 1; + + talloc_free(tnode->connect_te); + tnode->connect_te = NULL; + + if (getsockopt(tnode->fd, SOL_SOCKET, SO_ERROR, &error, &len) != 0 || + error != 0) { + talloc_free(fde); + close(tnode->fd); + tnode->fd = -1; + event_add_timed(ctdb->ev, tnode, timeval_current_ofs(1, 0), + ctdb_tcp_node_connect, node); + return; + } + + talloc_free(fde); + + setsockopt(tnode->fd,IPPROTO_TCP,TCP_NODELAY,(char *)&one,sizeof(one)); + setsockopt(tnode->fd,SOL_SOCKET,SO_KEEPALIVE,(char *)&one,sizeof(one)); + + ctdb_queue_set_fd(tnode->out_queue, tnode->fd); + + /* tell the ctdb layer we are connected */ + node->ctdb->upcalls->node_connected(node); +} + + +static int ctdb_tcp_get_address(struct ctdb_context *ctdb, + const char *address, struct in_addr *addr) +{ + if (inet_pton(AF_INET, address, addr) <= 0) { + struct hostent *he = gethostbyname(address); + if (he == NULL || he->h_length > sizeof(*addr)) { + ctdb_set_error(ctdb, "invalid nework address '%s'\n", + address); + return -1; + } + memcpy(addr, he->h_addr, he->h_length); + } + return 0; +} + +/* + called when we should try and establish a tcp connection to a node +*/ +void ctdb_tcp_node_connect(struct event_context *ev, struct timed_event *te, + struct timeval t, void *private_data) +{ + struct ctdb_node *node = talloc_get_type(private_data, + struct ctdb_node); + struct ctdb_tcp_node *tnode = talloc_get_type(node->private_data, + struct ctdb_tcp_node); + struct ctdb_context *ctdb = node->ctdb; + struct sockaddr_in sock_in; + struct sockaddr_in sock_out; + + if (tnode->fd != -1) { + talloc_free(tnode->connect_fde); + tnode->connect_fde = NULL; + close(tnode->fd); + tnode->fd = -1; + } + + tnode->fd = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); + + set_nonblocking(tnode->fd); + set_close_on_exec(tnode->fd); + + ZERO_STRUCT(sock_out); +#ifdef HAVE_SOCK_SIN_LEN + sock_out.sin_len = sizeof(sock_out); +#endif + if (ctdb_tcp_get_address(ctdb, node->address.address, &sock_out.sin_addr) != 0) { + return; + } + sock_out.sin_port = htons(node->address.port); + sock_out.sin_family = PF_INET; + + + /* Bind our side of the socketpair to the same address we use to listen + * on incoming CTDB traffic. + * We must specify this address to make sure that the address we expose to + * the remote side is actually routable in case CTDB traffic will run on + * a dedicated non-routeable network. + */ + ZERO_STRUCT(sock_in); +#ifdef HAVE_SOCK_SIN_LEN + sock_in.sin_len = sizeof(sock_in); +#endif + if (ctdb_tcp_get_address(ctdb, ctdb->address.address, &sock_in.sin_addr) != 0) { + return; + } + sock_in.sin_port = htons(0); /* INPORT_ANY is not always available */ + sock_in.sin_family = PF_INET; + bind(tnode->fd, (struct sockaddr *)&sock_in, sizeof(sock_in)); + + if (connect(tnode->fd, (struct sockaddr *)&sock_out, sizeof(sock_out)) != 0 && + errno != EINPROGRESS) { + /* try again once a second */ + close(tnode->fd); + tnode->fd = -1; + event_add_timed(ctdb->ev, tnode, timeval_current_ofs(1, 0), + ctdb_tcp_node_connect, node); + return; + } + + /* non-blocking connect - wait for write event */ + tnode->connect_fde = event_add_fd(node->ctdb->ev, tnode, tnode->fd, + EVENT_FD_WRITE|EVENT_FD_READ, + ctdb_node_connect_write, node); + + /* don't give it long to connect - retry in one second. This ensures + that we find a node is up quickly (tcp normally backs off a syn reply + delay by quite a lot) */ + tnode->connect_te = event_add_timed(ctdb->ev, tnode, timeval_current_ofs(1, 0), + ctdb_tcp_node_connect, node); +} + +/* + called when we get contacted by another node + currently makes no attempt to check if the connection is really from a ctdb + node in our cluster +*/ +static void ctdb_listen_event(struct event_context *ev, struct fd_event *fde, + uint16_t flags, void *private_data) +{ + struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context); + struct ctdb_tcp *ctcp = talloc_get_type(ctdb->private_data, struct ctdb_tcp); + struct sockaddr_in addr; + socklen_t len; + int fd, nodeid; + struct ctdb_incoming *in; + int one = 1; + const char *incoming_node; + + memset(&addr, 0, sizeof(addr)); + len = sizeof(addr); + fd = accept(ctcp->listen_fd, (struct sockaddr *)&addr, &len); + if (fd == -1) return; + + incoming_node = inet_ntoa(addr.sin_addr); + for (nodeid=0;nodeid<ctdb->num_nodes;nodeid++) { + if (!strcmp(incoming_node, ctdb->nodes[nodeid]->address.address)) { + DEBUG(0, ("Incoming connection from node:%d %s\n",nodeid,incoming_node)); + break; + } + } + if (nodeid>=ctdb->num_nodes) { + DEBUG(0, ("Refused connection from unknown node %s\n", incoming_node)); + close(fd); + return; + } + + in = talloc_zero(ctcp, struct ctdb_incoming); + in->fd = fd; + in->ctdb = ctdb; + + set_nonblocking(in->fd); + set_close_on_exec(in->fd); + + setsockopt(in->fd,SOL_SOCKET,SO_KEEPALIVE,(char *)&one,sizeof(one)); + + in->queue = ctdb_queue_setup(ctdb, in, in->fd, CTDB_TCP_ALIGNMENT, + ctdb_tcp_read_cb, in); +} + + +/* + automatically find which address to listen on +*/ +static int ctdb_tcp_listen_automatic(struct ctdb_context *ctdb) +{ + struct ctdb_tcp *ctcp = talloc_get_type(ctdb->private_data, + struct ctdb_tcp); + struct sockaddr_in sock; + int lock_fd, i; + const char *lock_path = "/tmp/.ctdb_socket_lock"; + struct flock lock; + + /* in order to ensure that we don't get two nodes with the + same adddress, we must make the bind() and listen() calls + atomic. The SO_REUSEADDR setsockopt only prevents double + binds if the first socket is in LISTEN state */ + lock_fd = open(lock_path, O_RDWR|O_CREAT, 0666); + if (lock_fd == -1) { + DEBUG(0,("Unable to open %s\n", lock_path)); + return -1; + } + + lock.l_type = F_WRLCK; + lock.l_whence = SEEK_SET; + lock.l_start = 0; + lock.l_len = 1; + lock.l_pid = 0; + + if (fcntl(lock_fd, F_SETLKW, &lock) != 0) { + DEBUG(0,("Unable to lock %s\n", lock_path)); + close(lock_fd); + return -1; + } + + for (i=0;i<ctdb->num_nodes;i++) { + ZERO_STRUCT(sock); +#ifdef HAVE_SOCK_SIN_LEN + sock.sin_len = sizeof(sock); +#endif + sock.sin_port = htons(ctdb->nodes[i]->address.port); + sock.sin_family = PF_INET; + if (ctdb_tcp_get_address(ctdb, ctdb->nodes[i]->address.address, + &sock.sin_addr) != 0) { + continue; + } + + if (bind(ctcp->listen_fd, (struct sockaddr * )&sock, + sizeof(sock)) == 0) { + break; + } + } + + if (i == ctdb->num_nodes) { + DEBUG(0,("Unable to bind to any of the node addresses - giving up\n")); + goto failed; + } + ctdb->address = ctdb->nodes[i]->address; + ctdb->name = talloc_asprintf(ctdb, "%s:%u", + ctdb->address.address, + ctdb->address.port); + ctdb->vnn = ctdb->nodes[i]->vnn; + ctdb->nodes[i]->flags &= ~NODE_FLAGS_DISCONNECTED; + DEBUG(1,("ctdb chose network address %s:%u vnn %u\n", + ctdb->address.address, + ctdb->address.port, + ctdb->vnn)); + + if (listen(ctcp->listen_fd, 10) == -1) { + goto failed; + } + + event_add_fd(ctdb->ev, ctcp, ctcp->listen_fd, EVENT_FD_READ|EVENT_FD_AUTOCLOSE, + ctdb_listen_event, ctdb); + + close(lock_fd); + return 0; + +failed: + close(lock_fd); + close(ctcp->listen_fd); + ctcp->listen_fd = -1; + return -1; +} + + +/* + listen on our own address +*/ +int ctdb_tcp_listen(struct ctdb_context *ctdb) +{ + struct ctdb_tcp *ctcp = talloc_get_type(ctdb->private_data, + struct ctdb_tcp); + struct sockaddr_in sock; + int one = 1; + + ctcp->listen_fd = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); + if (ctcp->listen_fd == -1) { + ctdb_set_error(ctdb, "socket failed\n"); + return -1; + } + + set_close_on_exec(ctcp->listen_fd); + + setsockopt(ctcp->listen_fd,SOL_SOCKET,SO_REUSEADDR,(char *)&one,sizeof(one)); + + /* we can either auto-bind to the first available address, or we can + use a specified address */ + if (!ctdb->address.address) { + return ctdb_tcp_listen_automatic(ctdb); + } + + ZERO_STRUCT(sock); +#ifdef HAVE_SOCK_SIN_LEN + sock.sin_len = sizeof(sock); +#endif + sock.sin_port = htons(ctdb->address.port); + sock.sin_family = PF_INET; + + if (ctdb_tcp_get_address(ctdb, ctdb->address.address, + &sock.sin_addr) != 0) { + goto failed; + } + + if (bind(ctcp->listen_fd, (struct sockaddr * )&sock, sizeof(sock)) != 0) { + goto failed; + } + + if (listen(ctcp->listen_fd, 10) == -1) { + goto failed; + } + + event_add_fd(ctdb->ev, ctcp, ctcp->listen_fd, EVENT_FD_READ|EVENT_FD_AUTOCLOSE, + ctdb_listen_event, ctdb); + + return 0; + +failed: + close(ctcp->listen_fd); + ctcp->listen_fd = -1; + return -1; +} + diff --git a/source4/cluster/ctdb/tcp/tcp_init.c b/source4/cluster/ctdb/tcp/tcp_init.c new file mode 100644 index 0000000000..f5d4e4c1d6 --- /dev/null +++ b/source4/cluster/ctdb/tcp/tcp_init.c @@ -0,0 +1,140 @@ +/* + ctdb over TCP + + Copyright (C) Andrew Tridgell 2006 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. +*/ + +#include "includes.h" +#include "lib/tdb/include/tdb.h" +#include "lib/events/events.h" +#include "system/network.h" +#include "system/filesys.h" +#include "../include/ctdb_private.h" +#include "ctdb_tcp.h" + + +/* + initialise tcp portion of a ctdb node +*/ +static int ctdb_tcp_add_node(struct ctdb_node *node) +{ + struct ctdb_tcp *ctcp = talloc_get_type(node->ctdb->private_data, + struct ctdb_tcp); + struct ctdb_tcp_node *tnode; + tnode = talloc_zero(ctcp, struct ctdb_tcp_node); + CTDB_NO_MEMORY(node->ctdb, tnode); + + tnode->fd = -1; + node->private_data = tnode; + + tnode->out_queue = ctdb_queue_setup(node->ctdb, ctcp, tnode->fd, CTDB_TCP_ALIGNMENT, + ctdb_tcp_tnode_cb, node); + + return 0; +} + +/* + initialise transport structures +*/ +static int ctdb_tcp_initialise(struct ctdb_context *ctdb) +{ + int i; + + /* listen on our own address */ + if (ctdb_tcp_listen(ctdb) != 0) return -1; + + for (i=0; i<ctdb->num_nodes; i++) { + if (ctdb_tcp_add_node(ctdb->nodes[i]) != 0) { + DEBUG(0, ("methods->add_node failed at %d\n", i)); + return -1; + } + } + + return 0; +} + +/* + start the protocol going +*/ +static int ctdb_tcp_start(struct ctdb_context *ctdb) +{ + int i; + + /* startup connections to the other servers - will happen on + next event loop */ + for (i=0;i<ctdb->num_nodes;i++) { + struct ctdb_node *node = *(ctdb->nodes + i); + struct ctdb_tcp_node *tnode = talloc_get_type( + node->private_data, struct ctdb_tcp_node); + if (!ctdb_same_address(&ctdb->address, &node->address)) { + event_add_timed(ctdb->ev, tnode, timeval_zero(), + ctdb_tcp_node_connect, node); + } + } + + return 0; +} + + +/* + shutdown the transport +*/ +static void ctdb_tcp_shutdown(struct ctdb_context *ctdb) +{ + struct ctdb_tcp *ctcp = talloc_get_type(ctdb->private_data, + struct ctdb_tcp); + talloc_free(ctcp); + ctdb->private_data = NULL; +} + + +/* + transport packet allocator - allows transport to control memory for packets +*/ +static void *ctdb_tcp_allocate_pkt(TALLOC_CTX *mem_ctx, size_t size) +{ + /* tcp transport needs to round to 8 byte alignment to ensure + that we can use a length header and 64 bit elements in + structures */ + size = (size+(CTDB_TCP_ALIGNMENT-1)) & ~(CTDB_TCP_ALIGNMENT-1); + return talloc_size(mem_ctx, size); +} + + +static const struct ctdb_methods ctdb_tcp_methods = { + .initialise = ctdb_tcp_initialise, + .start = ctdb_tcp_start, + .queue_pkt = ctdb_tcp_queue_pkt, + .add_node = ctdb_tcp_add_node, + .allocate_pkt = ctdb_tcp_allocate_pkt, + .shutdown = ctdb_tcp_shutdown, +}; + +/* + initialise tcp portion of ctdb +*/ +int ctdb_tcp_init(struct ctdb_context *ctdb) +{ + struct ctdb_tcp *ctcp; + ctcp = talloc_zero(ctdb, struct ctdb_tcp); + CTDB_NO_MEMORY(ctdb, ctcp); + + ctcp->listen_fd = -1; + ctdb->private_data = ctcp; + ctdb->methods = &ctdb_tcp_methods; + return 0; +} + diff --git a/source4/cluster/ctdb/tcp/tcp_io.c b/source4/cluster/ctdb/tcp/tcp_io.c new file mode 100644 index 0000000000..c10afb3425 --- /dev/null +++ b/source4/cluster/ctdb/tcp/tcp_io.c @@ -0,0 +1,89 @@ +/* + ctdb over TCP + + Copyright (C) Andrew Tridgell 2006 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. +*/ + +#include "includes.h" +#include "lib/events/events.h" +#include "lib/util/dlinklist.h" +#include "lib/tdb/include/tdb.h" +#include "system/network.h" +#include "system/filesys.h" +#include "../include/ctdb_private.h" +#include "ctdb_tcp.h" + + +/* + called when a complete packet has come in + */ +void ctdb_tcp_read_cb(uint8_t *data, size_t cnt, void *args) +{ + struct ctdb_incoming *in = talloc_get_type(args, struct ctdb_incoming); + struct ctdb_req_header *hdr = (struct ctdb_req_header *)data; + + if (data == NULL) { + /* incoming socket has died */ + goto failed; + } + + if (cnt < sizeof(*hdr)) { + DEBUG(0,(__location__ " Bad packet length %u\n", (unsigned)cnt)); + goto failed; + } + + if (cnt & (CTDB_TCP_ALIGNMENT-1)) { + DEBUG(0,(__location__ " Length 0x%x not multiple of alignment\n", + (unsigned)cnt)); + goto failed; + } + + + if (cnt != hdr->length) { + DEBUG(0,(__location__ " Bad header length %u expected %u\n", + (unsigned)hdr->length, (unsigned)cnt)); + goto failed; + } + + if (hdr->ctdb_magic != CTDB_MAGIC) { + DEBUG(0,(__location__ " Non CTDB packet 0x%x rejected\n", + hdr->ctdb_magic)); + goto failed; + } + + if (hdr->ctdb_version != CTDB_VERSION) { + DEBUG(0, (__location__ " Bad CTDB version 0x%x rejected\n", + hdr->ctdb_version)); + goto failed; + } + + /* tell the ctdb layer above that we have a packet */ + in->ctdb->upcalls->recv_pkt(in->ctdb, data, cnt); + return; + +failed: + talloc_free(in); +} + +/* + queue a packet for sending +*/ +int ctdb_tcp_queue_pkt(struct ctdb_node *node, uint8_t *data, uint32_t length) +{ + struct ctdb_tcp_node *tnode = talloc_get_type(node->private_data, + struct ctdb_tcp_node); + return ctdb_queue_send(tnode->out_queue, data, length); +} |