diff options
-rw-r--r-- | source3/modules/vfs_aio_linux.c | 643 |
1 files changed, 100 insertions, 543 deletions
diff --git a/source3/modules/vfs_aio_linux.c b/source3/modules/vfs_aio_linux.c index 7b739429e4..b685cdcad2 100644 --- a/source3/modules/vfs_aio_linux.c +++ b/source3/modules/vfs_aio_linux.c @@ -2,6 +2,7 @@ * Simulate Posix AIO using Linux kernel AIO. * * Copyright (C) Jeremy Allison 2012 + * Copyright (C) Volker Lendecke 2012 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -22,33 +23,19 @@ #include "system/filesys.h" #include "smbd/smbd.h" #include "smbd/globals.h" +#include "lib/util/tevent_unix.h" #include <sys/eventfd.h> #include <libaio.h> -struct aio_extra; static int event_fd = -1; static io_context_t io_ctx; -static int aio_linux_requestid; -static struct io_event *io_recv_events; static struct fd_event *aio_read_event; +static bool used; +static unsigned num_busy; -struct aio_private_data { - struct aio_private_data *prev, *next; - int requestid; - SMB_STRUCT_AIOCB *aiocb; - struct iocb *event_iocb; - ssize_t ret_size; - int ret_errno; - bool cancelled; -}; - -/* List of outstanding requests we have. */ -static struct aio_private_data *pd_list; - -static void aio_linux_handle_completion(struct event_context *event_ctx, - struct fd_event *event, - uint16 flags, - void *p); +static void aio_linux_done(struct event_context *event_ctx, + struct fd_event *event, + uint16 flags, void *private_data); /************************************************************************ Housekeeping. Cleanup if no activity for 30 seconds. @@ -62,7 +49,9 @@ static void aio_linux_housekeeping(struct tevent_context *event_ctx, /* Remove this timed event handler. */ TALLOC_FREE(te); - if (pd_list != NULL) { + if ((num_busy != 0) || used) { + used = false; + /* Still busy. Look again in 30 seconds. */ (void)tevent_add_timer(event_ctx, NULL, @@ -82,7 +71,6 @@ static void aio_linux_housekeeping(struct tevent_context *event_ctx, } TALLOC_FREE(aio_read_event); - TALLOC_FREE(io_recv_events); } /************************************************************************ @@ -99,7 +87,7 @@ static bool init_aio_linux(struct vfs_handle_struct *handle) } /* Schedule a shutdown event for 30 seconds from now. */ - te = tevent_add_timer(server_event_context(), + te = tevent_add_timer(handle->conn->sconn->ev_ctx, NULL, timeval_current_ofs(30, 0), aio_linux_housekeeping, @@ -109,14 +97,6 @@ static bool init_aio_linux(struct vfs_handle_struct *handle) goto fail; } - /* Ensure we have enough space for aio_pending_size events. */ - io_recv_events = talloc_zero_array(NULL, - struct io_event, - aio_pending_size); - if (io_recv_events == NULL) { - goto fail; - } - event_fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); if (event_fd == -1) { goto fail; @@ -126,7 +106,7 @@ static bool init_aio_linux(struct vfs_handle_struct *handle) NULL, event_fd, TEVENT_FD_READ, - aio_linux_handle_completion, + aio_linux_done, NULL); if (aio_read_event == NULL) { goto fail; @@ -146,7 +126,6 @@ static bool init_aio_linux(struct vfs_handle_struct *handle) DEBUG(10,("init_aio_linux: initialization failed\n")); TALLOC_FREE(te); - TALLOC_FREE(io_recv_events); TALLOC_FREE(aio_read_event); if (event_fd != -1) { close(event_fd); @@ -156,183 +135,91 @@ static bool init_aio_linux(struct vfs_handle_struct *handle) return false; } -/************************************************************************ - Private data destructor. -***********************************************************************/ +struct aio_linux_state { + struct iocb event_iocb; + ssize_t ret; + int err; +}; -static int pd_destructor(struct aio_private_data *pd) +static struct tevent_req *aio_linux_pread_send( + struct vfs_handle_struct *handle, TALLOC_CTX *mem_ctx, + struct tevent_context *ev, struct files_struct *fsp, + void *data, size_t n, off_t offset) { - DLIST_REMOVE(pd_list, pd); - return 0; -} - -/************************************************************************ - Create and initialize a private data struct. -***********************************************************************/ + struct tevent_req *req; + struct aio_linux_state *state; + struct iocb *piocb; + int ret; -static struct aio_private_data *create_private_data(TALLOC_CTX *ctx, - SMB_STRUCT_AIOCB *aiocb) -{ - struct aio_private_data *pd = talloc_zero(ctx, struct aio_private_data); - if (!pd) { + req = tevent_req_create(mem_ctx, &state, struct aio_linux_state); + if (req == NULL) { return NULL; } - pd->event_iocb = talloc_zero(pd, struct iocb); - pd->requestid = aio_linux_requestid++; - pd->aiocb = aiocb; - pd->ret_size = -1; - pd->ret_errno = EINPROGRESS; - talloc_set_destructor(pd, pd_destructor); - DLIST_ADD_END(pd_list, pd, struct aio_private_data *); - return pd; -} - -/************************************************************************ - Initiate an asynchronous pread call. -***********************************************************************/ - -static int aio_linux_read(struct vfs_handle_struct *handle, - struct files_struct *fsp, - SMB_STRUCT_AIOCB *aiocb) -{ - struct aio_extra *aio_ex = (struct aio_extra *)aiocb->aio_sigevent.sigev_value.sival_ptr; - struct aio_private_data *pd = NULL; - int ret; - if (!init_aio_linux(handle)) { - return -1; + tevent_req_error(req, EIO); + return tevent_req_post(req, ev); } - pd = create_private_data(aio_ex, aiocb); - if (pd == NULL) { - DEBUG(10, ("aio_linux_read: Could not create private data.\n")); - return -1; - } + io_prep_pread(&state->event_iocb, fsp->fh->fd, data, n, offset); + io_set_eventfd(&state->event_iocb, event_fd); + state->event_iocb.data = req; - io_prep_pread(pd->event_iocb, - pd->aiocb->aio_fildes, - discard_const(pd->aiocb->aio_buf), - pd->aiocb->aio_nbytes, - pd->aiocb->aio_offset); - io_set_eventfd(pd->event_iocb, event_fd); - /* Use the callback pointer as a private data ptr. */ - io_set_callback(pd->event_iocb, (io_callback_t)pd); + piocb = &state->event_iocb; - ret = io_submit(io_ctx, 1, &pd->event_iocb); + ret = io_submit(io_ctx, 1, &piocb); if (ret < 0) { - errno = ret; - return -1; + tevent_req_error(req, -ret); + return tevent_req_post(req, ev); } - - DEBUG(10, ("aio_linux_read: requestid=%d read requested " - "of %llu bytes at offset %llu\n", - pd->requestid, - (unsigned long long)pd->aiocb->aio_nbytes, - (unsigned long long)pd->aiocb->aio_offset)); - - return 0; + num_busy += 1; + used = true; + return req; } -/************************************************************************ - Initiate an asynchronous pwrite call. -***********************************************************************/ - -static int aio_linux_write(struct vfs_handle_struct *handle, - struct files_struct *fsp, - SMB_STRUCT_AIOCB *aiocb) +static struct tevent_req *aio_linux_pwrite_send( + struct vfs_handle_struct *handle, TALLOC_CTX *mem_ctx, + struct tevent_context *ev, struct files_struct *fsp, + const void *data, size_t n, off_t offset) { - struct aio_extra *aio_ex = (struct aio_extra *)aiocb->aio_sigevent.sigev_value.sival_ptr; - struct aio_private_data *pd = NULL; + struct tevent_req *req; + struct aio_linux_state *state; + struct iocb *piocb; int ret; + req = tevent_req_create(mem_ctx, &state, struct aio_linux_state); + if (req == NULL) { + return NULL; + } if (!init_aio_linux(handle)) { - return -1; + tevent_req_error(req, EIO); + return tevent_req_post(req, ev); } - pd = create_private_data(aio_ex, aiocb); - if (pd == NULL) { - DEBUG(10, ("aio_linux_write: Could not create private data.\n")); - return -1; - } + io_prep_pwrite(&state->event_iocb, fsp->fh->fd, discard_const(data), + n, offset); + io_set_eventfd(&state->event_iocb, event_fd); + state->event_iocb.data = req; - io_prep_pwrite(pd->event_iocb, - pd->aiocb->aio_fildes, - discard_const(pd->aiocb->aio_buf), - pd->aiocb->aio_nbytes, - pd->aiocb->aio_offset); - io_set_eventfd(pd->event_iocb, event_fd); - /* Use the callback pointer as a private data ptr. */ - io_set_callback(pd->event_iocb, (io_callback_t)pd); + piocb = &state->event_iocb; - ret = io_submit(io_ctx, 1, &pd->event_iocb); + ret = io_submit(io_ctx, 1, &piocb); if (ret < 0) { - errno = ret; - return -1; + tevent_req_error(req, -ret); + return tevent_req_post(req, ev); } - - DEBUG(10, ("aio_linux_write: requestid=%d pwrite requested " - "of %llu bytes at offset %llu\n", - pd->requestid, - (unsigned long long)pd->aiocb->aio_nbytes, - (unsigned long long)pd->aiocb->aio_offset)); - - return 0; + num_busy += 1; + used = true; + return req; } -/************************************************************************ - Save off the error / success conditions from the io_event. - Is idempotent (can be called multiple times given the same ioev). -***********************************************************************/ - -static void aio_linux_setup_returns(struct io_event *ioev) -{ - struct aio_private_data *pd = (struct aio_private_data *)ioev->data; - - if (ioev->res < 0) { - pd->ret_size = -1; - pd->ret_errno = -ioev->res; - } else { - pd->ret_size = ioev->res; - pd->ret_errno = 0; - } -} - -/************************************************************************ - Handle a single finished io. -***********************************************************************/ - -static void aio_linux_handle_io_finished(struct io_event *ioev) -{ - struct aio_extra *aio_ex = NULL; - struct aio_private_data *pd = (struct aio_private_data *)ioev->data; - - aio_linux_setup_returns(ioev); - - aio_ex = (struct aio_extra *)pd->aiocb->aio_sigevent.sigev_value.sival_ptr; - smbd_aio_complete_aio_ex(aio_ex); - - DEBUG(10,("aio_linux_handle_io_finished: requestid %d completed\n", - pd->requestid )); - TALLOC_FREE(aio_ex); -} - -/************************************************************************ - Callback when multiple IOs complete. -***********************************************************************/ - -static void aio_linux_handle_completion(struct event_context *event_ctx, - struct fd_event *event, - uint16 flags, - void *p) +static void aio_linux_done(struct event_context *event_ctx, + struct fd_event *event, + uint16 flags, void *private_data) { uint64_t num_events = 0; - DEBUG(10, ("aio_linux_handle_completion called with flags=%d\n", - (int)flags)); - - if ((flags & EVENT_FD_READ) == 0) { - return; - } + DEBUG(10, ("aio_linux_done called with flags=%d\n", + (int)flags)); /* Read the number of events available. */ if (sys_read(event_fd, &num_events, sizeof(num_events)) != @@ -341,382 +228,54 @@ static void aio_linux_handle_completion(struct event_context *event_ctx, } while (num_events > 0) { - uint64_t events_to_read = MIN(num_events, aio_pending_size); - struct timespec ts; - int i; + struct timespec ts = { 0, }; + struct io_event finished; + struct tevent_req *req; + struct aio_linux_state *state; int ret; - ts.tv_sec = 0; - ts.tv_nsec = 0; - - ret = io_getevents(io_ctx, - 1, - (long)events_to_read, - io_recv_events, - &ts); - + ret = io_getevents(io_ctx, 1, 1, &finished, &ts); if (ret < 0) { - errno = -ret; - DEBUG(1, ("aio_linux_handle_completion: " - "io_getevents error %s\n", - strerror(errno) )); + DEBUG(1, ("aio_linux_done: io_getevents returned %s\n", + strerror(-ret))); return; } - if (ret == 0) { - DEBUG(10, ("aio_linux_handle_completion: " - "io_getevents returned 0\n")); + DEBUG(10, ("aio_linux_done: io_getvents returned " + "0\n")); continue; } - /* ret is positive. */ - for (i = 0; i < ret; i++) { - aio_linux_handle_io_finished(&io_recv_events[i]); - } - - num_events -= ret; - } -} - -/************************************************************************ - Find the private data by aiocb. -***********************************************************************/ + num_busy -= 1; -static struct aio_private_data *find_private_data_by_aiocb(SMB_STRUCT_AIOCB *aiocb) -{ - struct aio_private_data *pd; + req = talloc_get_type_abort(finished.data, + struct tevent_req); + state = tevent_req_data(req, struct aio_linux_state); - for (pd = pd_list; pd != NULL; pd = pd->next) { - if (pd->aiocb == aiocb) { - return pd; + if (finished.res < 0) { + state->ret = -1; + state->err = -finished.res; + } else { + state->ret = finished.res; + state->err = 0; } + tevent_req_done(req); + num_events -= 1; } - - return NULL; } -/************************************************************************ - Called to return the result of a completed AIO. - Should only be called if aio_error returns something other than EINPROGRESS. - Returns: - Any other value - return from IO operation. -***********************************************************************/ - -static ssize_t aio_linux_return_fn(struct vfs_handle_struct *handle, - struct files_struct *fsp, - SMB_STRUCT_AIOCB *aiocb) +static ssize_t aio_linux_recv(struct tevent_req *req, int *err) { - struct aio_private_data *pd = find_private_data_by_aiocb(aiocb); - - if (pd == NULL) { - errno = EINVAL; - DEBUG(0, ("aio_linux_return_fn: returning EINVAL\n")); - return -1; - } + struct aio_linux_state *state = tevent_req_data( + req, struct aio_linux_state); - pd->aiocb = NULL; - - if (pd->cancelled) { - errno = ECANCELED; + if (tevent_req_is_unix_error(req, err)) { return -1; } - - if (pd->ret_size == -1) { - errno = pd->ret_errno; - } - - return pd->ret_size; -} - -/************************************************************************ - Called to check the result of an AIO. - Returns: - EINPROGRESS - still in progress. - EINVAL - invalid aiocb. - ECANCELED - request was cancelled. - 0 - request completed successfully. - Any other value - errno from IO operation. -***********************************************************************/ - -static int aio_linux_error_fn(struct vfs_handle_struct *handle, - struct files_struct *fsp, - SMB_STRUCT_AIOCB *aiocb) -{ - struct aio_private_data *pd = find_private_data_by_aiocb(aiocb); - - if (pd == NULL) { - return EINVAL; - } - if (pd->cancelled) { - return ECANCELED; - } - return pd->ret_errno; -} - -/************************************************************************ - Called to request the cancel of an AIO, or all of them on a specific - fsp if aiocb == NULL. -***********************************************************************/ - -static int aio_linux_cancel(struct vfs_handle_struct *handle, - struct files_struct *fsp, - SMB_STRUCT_AIOCB *aiocb) -{ - struct aio_private_data *pd = NULL; - - for (pd = pd_list; pd != NULL; pd = pd->next) { - if (pd->aiocb == NULL) { - continue; - } - if (pd->aiocb->aio_fildes != fsp->fh->fd) { - continue; - } - if ((aiocb != NULL) && (pd->aiocb != aiocb)) { - continue; - } - - /* - * We let the kernel do its job, but we discard the result when - * it's finished. NB. Should I call io_cancel here ? - */ - - pd->cancelled = true; - } - - return AIO_CANCELED; -} - -/************************************************************************ - Callback for a previously detected job completion deferred to the main - loop. -***********************************************************************/ - -static void aio_linux_handle_immediate(struct tevent_context *ctx, - struct tevent_immediate *im, - void *private_data) -{ - struct io_event *ioev = (struct io_event *)private_data; - - aio_linux_handle_io_finished(ioev); - TALLOC_FREE(ioev); -} - -/************************************************************************ - Private data struct used in suspend completion code. -***********************************************************************/ - -struct suspend_private { - int num_entries; - int num_finished; - const SMB_STRUCT_AIOCB * const *aiocb_array; -}; - -/************************************************************************ - Handle a single finished io from suspend. -***********************************************************************/ - -static void aio_linux_handle_suspend_io_finished(struct suspend_private *sp, - struct io_event *ioev) -{ - struct aio_private_data *pd = (struct aio_private_data *)ioev->data; - struct io_event *new_ioev = NULL; - struct tevent_immediate *im = NULL; - int i; - - /* Is this a requestid with an aiocb we're interested in ? */ - for (i = 0; i < sp->num_entries; i++) { - if (sp->aiocb_array[i] == pd->aiocb) { - sp->num_finished++; - /* - * We don't call aio_linux_handle_io_finished() - * here, but only the function that sets up the - * return values. This allows - * aio_linux_handle_io_finished() to be successfully - * called from smbd/aio.c:wait_for_aio_completion() - * once we return from here with all io's done. - */ - aio_linux_setup_returns(ioev); - return; - } - } - - /* Jobid completed we weren't waiting for. - We must reshedule this as an immediate event - on the main event context. */ - im = tevent_create_immediate(NULL); - if (!im) { - exit_server_cleanly("aio_linux_handle_suspend_completion: no memory"); - } - - new_ioev = (struct io_event *)talloc_memdup(NULL, - ioev, - sizeof(struct io_event)); - if (!new_ioev) { - exit_server_cleanly("aio_linux_handle_suspend_completion: no memory"); + if (state->ret == -1) { + *err = state->err; } - - DEBUG(10,("aio_linux_handle_suspend_completion: " - "re-scheduling requestid %d\n", - pd->requestid)); - - tevent_schedule_immediate(im, - server_event_context(), - aio_linux_handle_immediate, - (void *)new_ioev); -} - -/************************************************************************ - Callback when an IO completes from a suspend call. -***********************************************************************/ - -static void aio_linux_handle_suspend_completion(struct event_context *event_ctx, - struct fd_event *event, - uint16 flags, - void *p) -{ - struct suspend_private *sp = (struct suspend_private *)p; - uint64_t remaining_events = sp->num_entries - sp->num_finished; - uint64_t num_events = 0; - - DEBUG(10, ("aio_linux_handle_suspend_completion called with flags=%d\n", - (int)flags)); - - if ((flags & EVENT_FD_READ) == 0) { - return; - } - - /* Read the number of events available. */ - if (sys_read(event_fd, &num_events, sizeof(num_events)) != - sizeof(num_events)) { - smb_panic("aio_linux_handle_completion: invalid read"); - } - - while (num_events > 0) { - uint64_t events_to_read = MIN(num_events, remaining_events); - struct timespec ts; - int i; - int ret; - - ts.tv_sec = 0; - ts.tv_nsec = 0; - - ret = io_getevents(io_ctx, - 1, - (long)events_to_read, - io_recv_events, - &ts); - - if (ret < 0) { - errno = -ret; - DEBUG(1, ("aio_linux_handle_suspend_completion: " - "io_getevents error %s\n", - strerror(errno) )); - return; - } - - if (ret == 0) { - DEBUG(10, ("aio_linux_handle_suspend_completion: " - "io_getevents returned 0\n")); - continue; - } - - /* ret is positive. */ - for (i = 0; i < ret; i++) { - aio_linux_handle_suspend_io_finished(sp, - &io_recv_events[i]); - } - - num_events -= ret; - } -} - -static void aio_linux_suspend_timed_out(struct tevent_context *event_ctx, - struct tevent_timer *te, - struct timeval now, - void *private_data) -{ - bool *timed_out = (bool *)private_data; - /* Remove this timed event handler. */ - TALLOC_FREE(te); - *timed_out = true; -} - -/************************************************************************ - Called to request everything to stop until all IO is completed. -***********************************************************************/ - -static int aio_linux_suspend(struct vfs_handle_struct *handle, - struct files_struct *fsp, - const SMB_STRUCT_AIOCB * const aiocb_array[], - int n, - const struct timespec *timeout) -{ - struct event_context *ev = NULL; - struct fd_event *sock_event = NULL; - int ret = -1; - struct suspend_private sp; - bool timed_out = false; - TALLOC_CTX *frame = talloc_stackframe(); - - /* This is a blocking call, and has to use a sub-event loop. */ - ev = event_context_init(frame); - if (ev == NULL) { - errno = ENOMEM; - goto out; - } - - if (timeout) { - struct timeval tv = convert_timespec_to_timeval(*timeout); - struct tevent_timer *te = tevent_add_timer(ev, - frame, - timeval_current_ofs(tv.tv_sec, - tv.tv_usec), - aio_linux_suspend_timed_out, - &timed_out); - if (!te) { - errno = ENOMEM; - goto out; - } - } - - ZERO_STRUCT(sp); - sp.num_entries = n; - sp.aiocb_array = aiocb_array; - sp.num_finished = 0; - - sock_event = tevent_add_fd(ev, - frame, - event_fd, - TEVENT_FD_READ, - aio_linux_handle_suspend_completion, - (void *)&sp); - if (sock_event == NULL) { - goto out; - } - /* - * We're going to cheat here. We know that smbd/aio.c - * only calls this when it's waiting for every single - * outstanding call to finish on a close, so just wait - * individually for each IO to complete. We don't care - * what order they finish - only that they all do. JRA. - */ - while (sp.num_entries != sp.num_finished) { - if (tevent_loop_once(ev) == -1) { - goto out; - } - - if (timed_out) { - errno = EAGAIN; - goto out; - } - } - - ret = 0; - - out: - - TALLOC_FREE(frame); - return ret; + return state->ret; } static int aio_linux_connect(vfs_handle_struct *handle, const char *service, @@ -738,12 +297,10 @@ static int aio_linux_connect(vfs_handle_struct *handle, const char *service, static struct vfs_fn_pointers vfs_aio_linux_fns = { .connect_fn = aio_linux_connect, - .aio_read_fn = aio_linux_read, - .aio_write_fn = aio_linux_write, - .aio_return_fn = aio_linux_return_fn, - .aio_cancel_fn = aio_linux_cancel, - .aio_error_fn = aio_linux_error_fn, - .aio_suspend_fn = aio_linux_suspend, + .pread_send_fn = aio_linux_pread_send, + .pread_recv_fn = aio_linux_recv, + .pwrite_send_fn = aio_linux_pwrite_send, + .pwrite_recv_fn = aio_linux_recv, }; NTSTATUS vfs_aio_linux_init(void) |