From 16d2c2fa58c57539a9b540eb93825806caaea0b5 Mon Sep 17 00:00:00 2001 From: Tim Prouty Date: Tue, 27 Jan 2009 16:13:35 -0800 Subject: s3 OneFS: Add kernel oplocks implementation A few functions in oplocks_onefs.c need to be accessed from the onefs vfs module. It would be ideal if oplocks were implemented at the vfs layer, but since they aren't yet, a new header is added to source3/include to make these functions available to the onefs vfs module. oplocks_onefs.o doesn't need to be linked into the onefs vfs module explicitly, since it is already linked into smbd by default. --- source3/Makefile.in | 2 +- source3/configure.in | 2 + source3/include/oplock_onefs.h | 50 +++ source3/include/proto.h | 4 + source3/modules/onefs.h | 2 +- source3/modules/onefs_open.c | 155 ++++++-- source3/modules/onefs_system.c | 58 +-- source3/smbd/oplock.c | 7 + source3/smbd/oplock_onefs.c | 798 +++++++++++++++++++++++++++++++++++++++++ 9 files changed, 996 insertions(+), 82 deletions(-) create mode 100644 source3/include/oplock_onefs.h create mode 100644 source3/smbd/oplock_onefs.c diff --git a/source3/Makefile.in b/source3/Makefile.in index 942c5b33ae..2049953284 100644 --- a/source3/Makefile.in +++ b/source3/Makefile.in @@ -620,7 +620,7 @@ PROFILES_OBJ = utils/profiles.o \ $(LIB_OBJ) $(LIB_DUMMY_OBJ) \ $(POPT_LIB_OBJ) -OPLOCK_OBJ = smbd/oplock.o smbd/oplock_irix.o smbd/oplock_linux.o +OPLOCK_OBJ = smbd/oplock.o smbd/oplock_irix.o smbd/oplock_linux.o smbd/oplock_onefs.o NOTIFY_OBJ = smbd/notify.o smbd/notify_inotify.o smbd/notify_internal.o diff --git a/source3/configure.in b/source3/configure.in index b81e768073..10ce6f6e5e 100644 --- a/source3/configure.in +++ b/source3/configure.in @@ -1099,6 +1099,8 @@ if test x"$samba_cv_HAVE_ONEFS" = x"yes"; then AC_DEFINE(HAVE_ONEFS,1,[Whether building on Isilon OneFS]) default_shared_modules="$default_shared_modules vfs_onefs perfcount_onefs" ONEFS_LIBS="-lisi_acl" + # Need to also add general libs for oplocks support + save_LIBS="$save_LIBS 
-lisi_ecs -lisi_event -lisi_util -ldevstat" fi AC_SUBST(ONEFS_LIBS) LIBS="$save_LIBS" diff --git a/source3/include/oplock_onefs.h b/source3/include/oplock_onefs.h new file mode 100644 index 0000000000..a20becdf2c --- /dev/null +++ b/source3/include/oplock_onefs.h @@ -0,0 +1,50 @@ +/* + * Unix SMB/CIFS implementation. + * Support for OneFS kernel oplocks + * + * Copyright (C) Tim Prouty, 2009 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#ifndef _OPLOCK_ONEFS_H +#define _OPLOCK_ONEFS_H + +#if HAVE_ONEFS + +#include + +struct deferred_open_record { + bool delayed_for_oplocks; + bool failed; /* added for onefs_oplocks */ + struct file_id id; +}; + +/* + * OneFS oplock utility functions + */ +const char *onefs_oplock_str(enum oplock_type onefs_oplock_type); +int onefs_oplock_to_samba_oplock(enum oplock_type onefs_oplock); +enum oplock_type onefs_samba_oplock_to_oplock(int samba_oplock_type); + +/* + * OneFS oplock callback tracking + */ +void destroy_onefs_callback_record(uint64 id); +uint64 onefs_oplock_wait_record(uint16 mid); +void onefs_set_oplock_callback(uint64 id, files_struct *fsp); + +#endif /* HAVE_ONEFS */ + +#endif /* _OPLOCK_ONEFS_H */ diff --git a/source3/include/proto.h b/source3/include/proto.h index f553f7625e..1566a01dc5 100644 --- a/source3/include/proto.h +++ b/source3/include/proto.h @@ -6924,6 +6924,10 @@ int linux_set_lease_sighandler(int fd); int linux_setlease(int fd, int leasetype); struct kernel_oplocks *linux_init_kernel_oplocks(TALLOC_CTX *mem_ctx) ; +/* The following definitions come from smbd/oplock_onefs.c */ + +struct kernel_oplocks *onefs_init_kernel_oplocks(TALLOC_CTX *mem_ctx); + /* The following definitions come from smbd/password.c */ user_struct *get_valid_user_struct(uint16 vuid); diff --git a/source3/modules/onefs.h b/source3/modules/onefs.h index 2044ebec48..c8f19f4b31 100644 --- a/source3/modules/onefs.h +++ b/source3/modules/onefs.h @@ -22,7 +22,7 @@ #define _ONEFS_H #include "includes.h" - +#include "oplock_onefs.h" #include /* OneFS Module smb.conf parameters and defaults */ diff --git a/source3/modules/onefs_open.c b/source3/modules/onefs_open.c index c8415de521..b2b11ebaac 100644 --- a/source3/modules/onefs_open.c +++ b/source3/modules/onefs_open.c @@ -33,14 +33,12 @@ */ #include "onefs.h" +#include "smbd/globals.h" extern const struct generic_mapping file_generic_mapping; -extern bool global_client_failed_oplock_break; -struct 
deferred_open_record { - bool delayed_for_oplocks; - bool failed; /* added for onefs_oplocks */ - struct file_id id; +struct onefs_fsp_data { + uint64_t oplock_callback_id; }; static NTSTATUS onefs_create_file_unixpath(connection_struct *conn, @@ -55,9 +53,9 @@ static NTSTATUS onefs_create_file_unixpath(connection_struct *conn, uint64_t allocation_size, struct security_descriptor *sd, struct ea_list *ea_list, - files_struct **result, int *pinfo, + struct onefs_fsp_data *fsp_data, SMB_STRUCT_STAT *psbuf); /**************************************************************************** @@ -189,11 +187,6 @@ static NTSTATUS onefs_open_file(files_struct *fsp, flags |= O_NOFOLLOW; } #endif - /* Don't request an oplock if oplocks are turned off for the - * share. */ - if (!lp_oplocks(SNUM(conn))) - oplock_request = 0; - /* Stream handling */ if (is_ntfs_stream_name(path)) { status = onefs_split_ntfs_stream_name(talloc_tos(), path, @@ -203,6 +196,22 @@ static NTSTATUS onefs_open_file(files_struct *fsp, if (stream != NULL) { SMB_ASSERT(fsp->base_fsp); + /* + * We have never seen an oplock taken on a stream, and our + * current implementation doesn't support it. If a request is + * seen, log a loud error message and ignore the requested + * oplock. + */ + if ((oplock_request & ~SAMBA_PRIVATE_OPLOCK_MASK) != + NO_OPLOCK) { + DEBUG(0,("Oplock(%d) being requested on a stream! " + "Ignoring oplock request: base=%s, stream=%s", + oplock_request & ~SAMBA_PRIVATE_OPLOCK_MASK, + base, stream)); + /* Recover by requesting NO_OPLOCK instead. 
*/ + oplock_request &= SAMBA_PRIVATE_OPLOCK_MASK; + } + DEBUG(10,("Opening a stream: base=%s(%d), stream=%s", base, fsp->base_fsp->fh->fd, stream)); @@ -242,8 +251,8 @@ static NTSTATUS onefs_open_file(files_struct *fsp, status = map_nt_error_from_unix(errno); DEBUG(3,("Error opening file %s (%s) (local_flags=%d) " - "(flags=%d)\n", - path,nt_errstr(status),local_flags,flags)); + "(flags=%d)\n", + path, strerror(errno), local_flags, flags)); return status; } @@ -407,7 +416,11 @@ static void schedule_defer_open(struct share_mode_lock *lck, * measure here in case the other smbd is stuck * somewhere else. */ - timeout = timeval_set(OPLOCK_BREAK_TIMEOUT*2, 0); + /* + * On OneFS, the kernel will always send an oplock_revoked message + * before this timeout is hit. + */ + timeout = timeval_set(OPLOCK_BREAK_TIMEOUT*10, 0); /* Nothing actually uses state.delayed_for_oplocks but it's handy to differentiate in debug messages @@ -415,7 +428,7 @@ static void schedule_defer_open(struct share_mode_lock *lck, a 1 second delay for share mode conflicts. */ state.delayed_for_oplocks = True; - state.failed = False; + state.failed = false; state.id = lck->id; if (!request_timed_out(request_time, timeout)) { @@ -438,6 +451,7 @@ NTSTATUS onefs_open_file_ntcreate(connection_struct *conn, struct security_descriptor *sd, files_struct *fsp, int *pinfo, + struct onefs_fsp_data *fsp_data, SMB_STRUCT_STAT *psbuf) { int flags=0; @@ -461,7 +475,7 @@ NTSTATUS onefs_open_file_ntcreate(connection_struct *conn, char *parent_dir; const char *newname; int granted_oplock; - uint64 oplock_waiter; + uint64_t oplock_callback_id = 0; uint32 createfile_attributes = 0; ZERO_STRUCT(id); @@ -505,6 +519,30 @@ NTSTATUS onefs_open_file_ntcreate(connection_struct *conn, create_disposition, create_options, unx_mode, oplock_request)); + /* + * Any non-stat-only open has the potential to contend oplocks, which + * means to avoid blocking in the kernel (which is unacceptable), the + * open must be deferred. 
In order to defer opens, req must not be + * NULL. The known cases of calling with a NULL req: + * + * 1. Open the base file of a stream: Always done stat-only + * + * 2. Open the stream: Oplocks are disallowed on streams, so an + * oplock will never be contended. + * + * 3. open_file_fchmod(), which is called from 3 places: + * A. try_chown: Posix acls only. Never called on onefs. + * B. set_ea_dos_attributes: Can't be called from onefs, because + * SMB_VFS_SETXATTR return ENOSYS. + * C. file_set_dos_mode: This would only happen if the "dos + * filemode" smb.conf parameter is set to yes. We ship with + * it off, but if a customer were to turn it on it would be + * bad. + */ + if (req == NULL && !is_stat_open(access_mask) && !is_ntfs_stream_name(fname)) { + smb_panic("NULL req on a non-stat-open!"); + } + if ((req == NULL) && ((oplock_request & INTERNAL_OPEN_ONLY) == 0)) { DEBUG(0, ("No smb request but not an internal only open!\n")); return NT_STATUS_INTERNAL_ERROR; @@ -839,10 +877,22 @@ NTSTATUS onefs_open_file_ntcreate(connection_struct *conn, (unsigned int)unx_mode, (unsigned int)access_mask, (unsigned int)open_access_mask)); - oplock_waiter = 1; //ifs_oplock_wait_record(mid); - - if (oplock_waiter == 0) { - return NT_STATUS_NO_MEMORY; + /* + * Since the open is guaranteed to be stat only if req == NULL, a + * callback record is only needed if req != NULL. + */ + if (req) { + SMB_ASSERT(fsp_data); + oplock_callback_id = onefs_oplock_wait_record(req->mid); + if (oplock_callback_id == 0) { + return NT_STATUS_NO_MEMORY; + } + } else { + /* + * It is also already asserted it's either a stream or a + * stat-only open at this point. + */ + SMB_ASSERT(fsp->oplock_type == NO_OPLOCK); } /* Do the open. 
*/ @@ -858,7 +908,7 @@ NTSTATUS onefs_open_file_ntcreate(connection_struct *conn, access_mask, open_access_mask, fsp->oplock_type, - oplock_waiter, + oplock_callback_id, share_access, create_options, createfile_attributes, @@ -910,6 +960,9 @@ NTSTATUS onefs_open_file_ntcreate(connection_struct *conn, goto cleanup_destroy; } /* Waiting for an oplock */ + DEBUG(5,("Async createfile because a client has an " + "oplock on %s\n", fname)); + SMB_ASSERT(req); schedule_defer_open(lck, request_time, req); goto cleanup; @@ -1044,7 +1097,9 @@ NTSTATUS onefs_open_file_ntcreate(connection_struct *conn, * Normal error, for example EACCES */ cleanup_destroy: - //destroy_ifs_callback_record(oplock_waiter); + if (oplock_callback_id != 0) { + destroy_onefs_callback_record(oplock_callback_id); + } cleanup: TALLOC_FREE(lck); return status; @@ -1052,9 +1107,12 @@ NTSTATUS onefs_open_file_ntcreate(connection_struct *conn, fsp->oplock_type = granted_oplock; - /* XXX uncomment for oplocks */ - //ifs_set_oplock_callback(oplock_waiter, fsp); - //fsp->oplock_callback_id = oplock_waiter; + if (oplock_callback_id != 0) { + onefs_set_oplock_callback(oplock_callback_id, fsp); + fsp_data->oplock_callback_id = oplock_callback_id; + } else { + SMB_ASSERT(fsp->oplock_type == NO_OPLOCK); + } if (!file_existed) { struct timespec old_write_time = get_mtimespec(psbuf); @@ -1195,6 +1253,16 @@ NTSTATUS onefs_open_file_ntcreate(connection_struct *conn, } } + if (fsp->oplock_type == LEVEL_II_OPLOCK && + (!lp_level2_oplocks(SNUM(conn)) || + !(global_client_caps & CAP_LEVEL_II_OPLOCKS))) { + + DEBUG(5, ("Downgrading level2 oplock on open " + "because level2 oplocks = off\n")); + + release_file_oplock(fsp); + } + if (info == FILE_WAS_OVERWRITTEN || info == FILE_WAS_CREATED || info == FILE_WAS_SUPERSEDED) { new_file_created = True; @@ -1654,6 +1722,7 @@ static NTSTATUS open_streams_for_delete(connection_struct *conn, NULL, /* ea_list */ &streams[i], /* result */ NULL, /* pinfo */ + NULL, /* fsp_data */ NULL); 
/* psbuf */ TALLOC_FREE(streamname); @@ -1701,6 +1770,7 @@ static NTSTATUS onefs_create_file_unixpath(connection_struct *conn, struct ea_list *ea_list, files_struct **result, int *pinfo, + struct onefs_fsp_data *fsp_data, SMB_STRUCT_STAT *psbuf) { SMB_STRUCT_STAT sbuf; @@ -1733,6 +1803,8 @@ static NTSTATUS onefs_create_file_unixpath(connection_struct *conn, } if (req == NULL) { + SMB_ASSERT((oplock_request & ~SAMBA_PRIVATE_OPLOCK_MASK) == + NO_OPLOCK); oplock_request |= INTERNAL_OPEN_ONLY; } @@ -1793,7 +1865,7 @@ static NTSTATUS onefs_create_file_unixpath(connection_struct *conn, conn, /* conn */ NULL, /* req */ base, /* fname */ - 0, /* access_mask */ + SYNCHRONIZE_ACCESS, /* access_mask */ (FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE), /* share_access */ @@ -1806,6 +1878,7 @@ static NTSTATUS onefs_create_file_unixpath(connection_struct *conn, NULL, /* ea_list */ &base_fsp, /* result */ NULL, /* pinfo */ + NULL, /* fsp_data */ NULL); /* psbuf */ if (!NT_STATUS_IS_OK(status)) { @@ -1890,6 +1963,7 @@ static NTSTATUS onefs_create_file_unixpath(connection_struct *conn, sd, /* sd */ fsp, /* result */ &info, /* pinfo */ + fsp_data, /* fsp_data */ &sbuf); /* psbuf */ if(!NT_STATUS_IS_OK(status)) { @@ -2013,6 +2087,13 @@ static NTSTATUS onefs_create_file_unixpath(connection_struct *conn, return status; } +static void destroy_onefs_fsp_data(void *p_data) +{ + struct onefs_fsp_data *fsp_data = (struct onefs_fsp_data *)p_data; + + destroy_onefs_callback_record(fsp_data->oplock_callback_id); +} + /** * SMB_VFS_CREATE_FILE interface to onefs. 
*/ @@ -2036,6 +2117,7 @@ NTSTATUS onefs_create_file(vfs_handle_struct *handle, { connection_struct *conn = handle->conn; struct case_semantics_state *case_state = NULL; + struct onefs_fsp_data fsp_data = {}; SMB_STRUCT_STAT sbuf; int info = FILE_WAS_OPENED; files_struct *fsp = NULL; @@ -2139,6 +2221,7 @@ NTSTATUS onefs_create_file(vfs_handle_struct *handle, ea_list, /* ea_list */ &fsp, /* result */ &info, /* pinfo */ + &fsp_data, /* fsp_data */ &sbuf); /* psbuf */ if (!NT_STATUS_IS_OK(status)) { @@ -2147,6 +2230,26 @@ NTSTATUS onefs_create_file(vfs_handle_struct *handle, DEBUG(10, ("onefs_create_file: info=%d\n", info)); + /* + * Setup private onefs_fsp_data. Currently the private data struct is + * only used to store the oplock_callback_id so that when the file is + * closed, the onefs_callback_record can be properly cleaned up in the + * oplock_onefs sub-system. + */ + if (fsp) { + struct onefs_fsp_data *fsp_data_tmp = NULL; + fsp_data_tmp = (struct onefs_fsp_data *) + VFS_ADD_FSP_EXTENSION(handle, fsp, struct onefs_fsp_data, + &destroy_onefs_fsp_data); + + if (fsp_data_tmp == NULL) { + status = NT_STATUS_NO_MEMORY; + goto fail; + } + + *fsp_data_tmp = fsp_data; + } + *result = fsp; if (pinfo != NULL) { *pinfo = info; diff --git a/source3/modules/onefs_system.c b/source3/modules/onefs_system.c index 4ebdf12a50..acc38fba30 100644 --- a/source3/modules/onefs_system.c +++ b/source3/modules/onefs_system.c @@ -68,58 +68,6 @@ static void smlock_dump(int debuglevel, const struct sm_lock *sml) (int)sml->sm_timeout.tv_usec)); } -/* - * Return string value of onefs oplock types. - */ -static const char *onefs_oplock_str(enum oplock_type onefs_oplock_type) -{ - switch (onefs_oplock_type) { - case OPLOCK_NONE: - return "OPLOCK_NONE"; - case OPLOCK_EXCLUSIVE: - return "OPLOCK_EXCLUSIVE"; - case OPLOCK_BATCH: - return "OPLOCK_BATCH"; - case OPLOCK_SHARED: - return "OPLOCK_SHARED"; - default: - break; - } - return "UNKNOWN"; -} - -/* - * Convert from onefs to samba oplock. 
- */ -static int onefs_oplock_to_samba_oplock(enum oplock_type onefs_oplock) -{ - switch (onefs_oplock) { - case OPLOCK_NONE: - return NO_OPLOCK; - case OPLOCK_EXCLUSIVE: - return EXCLUSIVE_OPLOCK; - case OPLOCK_BATCH: - return BATCH_OPLOCK; - case OPLOCK_SHARED: - return LEVEL_II_OPLOCK; - default: - DEBUG(0, ("unknown oplock type %d found\n", onefs_oplock)); - break; - } - return NO_OPLOCK; -} - -/* - * Convert from samba to onefs oplock. - */ -static enum oplock_type onefs_samba_oplock_to_oplock(int samba_oplock_type) -{ - if (BATCH_OPLOCK_TYPE(samba_oplock_type)) return OPLOCK_BATCH; - if (EXCLUSIVE_OPLOCK_TYPE(samba_oplock_type)) return OPLOCK_EXCLUSIVE; - if (LEVEL_II_OPLOCK_TYPE(samba_oplock_type)) return OPLOCK_SHARED; - return OPLOCK_NONE; -} - /** * External interface to ifs_createfile */ @@ -164,10 +112,12 @@ int onefs_sys_create_file(connection_struct *conn, pifs_sd = &ifs_sd; } + /* Stripping off private bits will be done for us. */ onefs_oplock = onefs_samba_oplock_to_oplock(oplock_request); - /* Temporary until oplock work is added to vfs_onefs */ - onefs_oplock = OPLOCK_NONE; + if (!lp_oplocks(SNUM(conn))) { + SMB_ASSERT(onefs_oplock == OPLOCK_NONE); + } /* Convert samba dos flags to UF_DOS_* attributes. 
*/ onefs_dos_attributes = dos_attributes_to_stat_dos_flags(dos_flags); diff --git a/source3/smbd/oplock.c b/source3/smbd/oplock.c index a6ec9cfa2d..b39e5bf634 100644 --- a/source3/smbd/oplock.c +++ b/source3/smbd/oplock.c @@ -122,6 +122,11 @@ void release_file_oplock(files_struct *fsp) static void downgrade_file_oplock(files_struct *fsp) { + if (!EXCLUSIVE_OPLOCK_TYPE(fsp->oplock_type)) { + DEBUG(0, ("trying to downgrade an already-downgraded oplock!\n")); + return; + } + if (koplocks) { koplocks->ops->release_oplock(koplocks, fsp, LEVEL_II_OPLOCK); } @@ -916,6 +921,8 @@ bool init_oplocks(struct messaging_context *msg_ctx) koplocks = irix_init_kernel_oplocks(talloc_autofree_context()); #elif HAVE_KERNEL_OPLOCKS_LINUX koplocks = linux_init_kernel_oplocks(talloc_autofree_context()); +#elif HAVE_ONEFS + koplocks = onefs_init_kernel_oplocks(talloc_autofree_context()); #endif } diff --git a/source3/smbd/oplock_onefs.c b/source3/smbd/oplock_onefs.c new file mode 100644 index 0000000000..0908ce4386 --- /dev/null +++ b/source3/smbd/oplock_onefs.c @@ -0,0 +1,798 @@ +/* + * Unix SMB/CIFS implementation. + * Support for OneFS kernel oplocks + * + * Copyright (C) Volker Lendecke 2007 + * Copyright (C) Tim Prouty, 2009 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#define DBGC_CLASS DBGC_LOCKING + +#include "includes.h" + +#if HAVE_ONEFS +#include "oplock_onefs.h" +#include "smbd/globals.h" + +#include +#include +#include + +struct onefs_oplocks_context { + struct kernel_oplocks *ctx; + const struct oplocks_event_ops *onefs_ops; + int onefs_event_fd; + struct fd_event *read_fde; +}; + +enum onefs_callback_state { + ONEFS_OPEN_FILE, + ONEFS_WAITING_FOR_OPLOCK +}; + +struct onefs_callback_record { + struct onefs_callback_record *prev, *next; + uint64_t id; + enum onefs_callback_state state; + union { + files_struct *fsp; /* ONEFS_OPEN_FILE */ + uint16_t mid; /* ONEFS_WAITING_FOR_OPLOCK */ + } data; +}; + +/** + * Internal list of files (along with additional state) that have outstanding + * oplocks or requests for oplocks. + */ +struct onefs_callback_record *callback_recs; + +/** + * Convert a onefs_callback_record to a string. + */ +static char *onefs_callback_record_str_static(const struct onefs_callback_record *r) +{ + static fstring result; + + if (r == NULL) { + fstrcpy(result, "NULL callback record"); + return result; + } + + switch (r->state) { + case ONEFS_OPEN_FILE: + fstr_sprintf(result, "cb record %llu for file %s", + r->id, r->data.fsp->fsp_name); + break; + case ONEFS_WAITING_FOR_OPLOCK: + fstr_sprintf(result, "cb record %llu for pending mid %d", + r->id, (int)r->data.mid); + break; + default: + fstr_sprintf(result, "cb record %llu unknown state %d", + r->id, r->state); + break; + } + + return result; +} + +/** + * Traverse the list of onefs_callback_records and print all entries. + */ +static void debug_cb_records(const char *fn) +{ + struct onefs_callback_record *rec; + + if (DEBUGLEVEL < 10) + return; + + DEBUG(10, ("cb records (%s):\n", fn)); + + for (rec = callback_recs; rec; rec = rec->next) { + DEBUGADD(10, ("%s\n", onefs_callback_record_str_static(rec))); + } +} + +/** + * Find a callback record in the list of outstanding oplock operations. 
+ *
+ * Once ifs_createfile requests an oplock on a file, the kernel communicates
+ * with samba via the oplock event channel by sending events that reference an
+ * id.  This function maps that id to the onefs_callback_record that was
+ * created for it during the initial setup on open (onefs_oplock_wait_record).
+ * When a matching id is found in the onefs_callback_record list, its state is
+ * compared with expected_state; a mismatch is logged, asserted, and yields
+ * NULL.  NULL is also returned when no record carries the given id.
+ */
+static struct onefs_callback_record *onefs_find_cb(uint64_t id,
+    enum onefs_callback_state expected_state)
+{
+	struct onefs_callback_record *rec;
+
+	debug_cb_records("onefs_find_cb");
+
+	for (rec = callback_recs; rec; rec = rec->next) {
+		if (rec->id == id) {
+			DEBUG(10, ("found %s\n",
+				   onefs_callback_record_str_static(rec)));
+			break;
+		}
+	}
+
+	if (rec == NULL) {
+		DEBUG(5, ("Could not find callback record for id %llu\n", id));
+		return NULL;
+	}
+
+	if (rec->state != expected_state) {
+		DEBUG(0, ("Expected cb type %d, got %s\n", expected_state,
+			  onefs_callback_record_str_static(rec)));
+		SMB_ASSERT(0);
+		return NULL;
+	}
+
+	return rec;
+}
+
+/**
+ * Unlink and free the callback record matching id; an id of 0 is a no-op.
+ */
+void destroy_onefs_callback_record(uint64_t id)
+{
+	struct onefs_callback_record *rec;
+
+	debug_cb_records("destroy_onefs_callback_record");
+
+	if (id == 0) {
+		DEBUG(10, ("destroy_onefs_callback_record: Nothing to "
+			   "destroy\n"));
+		return;
+	}
+
+	for (rec = callback_recs; rec; rec = rec->next) {
+		if (rec->id == id) {
+			DLIST_REMOVE(callback_recs, rec);
+			SAFE_FREE(rec);
+			DEBUG(10, ("removed cb rec %llu\n", id));
+			return; /* rec is freed; do not continue iterating */
+		}
+	}
+
+	DEBUG(0, ("Could not find cb rec %llu to delete\n", id));
+	SMB_ASSERT(0);
+}
+
+/**
+ * Initialize a callback record and add it to the list of outstanding callback
+ * records.
+ *
+ * This is called in the open path before ifs_createfile so an id can be
+ * passed in.  Each callback record can be in one of two states:
+ *
+ * 1.
WAITING_FOR_OPLOCK: This is the initial state for all callback
+ *    records.  If ifs_createfile completes synchronously without needing
+ *    to break any level I oplocks, the state is transitioned to OPEN_FILE.
+ *    Otherwise ifs_createfile will finish asynchronously and the open is
+ *    deferred.  When the necessary level I oplocks have been broken, and the
+ *    open can be done, an event is sent by the kernel on the oplock event
+ *    channel, which is handled by semlock_available_handler.  At this point
+ *    the deferred open is retried.  Unless a level I oplock was acquired by
+ *    another client, ifs_createfile will now complete synchronously.
+ *
+ * 2. OPEN_FILE: Once ifs_createfile completes, the callback record is
+ *    transitioned to this state via onefs_set_oplock_callback.
+ */
+uint64_t onefs_oplock_wait_record(uint16_t mid)
+{
+	struct onefs_callback_record *result;
+	static uint64_t id_generator = 0;
+
+	if (!(result = SMB_MALLOC_P(struct onefs_callback_record))) {
+		DEBUG(0, ("malloc failed\n"));
+		return 0; /* 0 is never handed out as an id, so it means failure */
+	}
+
+	memset(result, '\0', sizeof(*result)); /* clear the entire record */
+
+	id_generator += 1;
+	if (id_generator == 0) {
+		/* Wrapped around: skip 0, which is reserved for "no record". */
+		id_generator += 1;
+	}
+
+	result->id = id_generator;
+
+	result->state = ONEFS_WAITING_FOR_OPLOCK;
+	result->data.mid = mid;
+	DLIST_ADD(callback_recs, result);
+
+	DEBUG(10, ("New cb rec %llu created\n", result->id));
+
+	return result->id;
+}
+
+/**
+ * Transition the callback record state to OPEN_FILE.
+ *
+ * This is called after the file is opened and an fsp struct has been
+ * allocated.  The mid is dropped in favor of storing the fsp.
+ */
+void onefs_set_oplock_callback(uint64_t id, files_struct *fsp)
+{
+	struct onefs_callback_record *cb;
+	char *msg;
+
+	DEBUG(10, ("onefs_set_oplock_callback called for cb rec %llu\n", id));
+
+	if (!(cb = onefs_find_cb(id, ONEFS_WAITING_FOR_OPLOCK))) {
+		if (asprintf(&msg, "Got invalid callback %llu\n", id) != -1) {
+			smb_panic(msg);
+		}
+		smb_panic("Got invalid callback id\n"); /* asprintf failed */
+	}
+
+	/*
+	 * Paranoia check: the mid must not still be queued as a deferred open.
+	 */
+	if (open_was_deferred(cb->data.mid)) {
+		if (asprintf(&msg, "Trying to upgrade callback for deferred "
+			     "open mid=%d\n", cb->data.mid) != -1) {
+			smb_panic(msg);
+		}
+		smb_panic("Trying to upgrade callback for deferred open "
+			  "mid\n");
+	}
+
+	cb->state = ONEFS_OPEN_FILE;
+	cb->data.fsp = fsp; /* overwrites data.mid: both share the union */
+}
+
+/**
+ * Using a callback record in the OPEN_FILE state (cb->data.fsp is read),
+ * fill in a share mode entry for share_mode_entry_to_message.
+ */
+static void init_share_mode_entry(struct share_mode_entry *sme,
+				  struct onefs_callback_record *cb,
+				  int op_type)
+{
+	ZERO_STRUCT(*sme);
+
+	sme->pid = procid_self();
+	sme->op_type = op_type;
+	sme->id = cb->data.fsp->file_id;
+	sme->share_file_id = cb->data.fsp->fh->gen_id;
+}
+
+/**
+ * Callback when a break-to-none event is received from the kernel.
+ *
+ * On OneFS level 1 oplocks are always broken to level 2 first, therefore an
+ * async level 2 break message is always sent when breaking to none.  The
+ * downside of this is that OneFS currently has no way to express breaking
+ * directly from level 1 to none.
+ */ +static void oplock_break_to_none_handler(uint64_t id) +{ + struct onefs_callback_record *cb; + struct share_mode_entry sme; + char msg[MSG_SMB_SHARE_MODE_ENTRY_SIZE]; + + DEBUG(10, ("oplock_break_to_none_handler called for id %llu\n", id)); + + if (!(cb = onefs_find_cb(id, ONEFS_OPEN_FILE))) { + DEBUG(3, ("oplock_break_to_none_handler: could not find " + "callback id %llu\n", id)); + return; + } + + DEBUG(10, ("oplock_break_to_none_handler called for file %s\n", + cb->data.fsp->fsp_name)); + + init_share_mode_entry(&sme, cb, FORCE_OPLOCK_BREAK_TO_NONE); + share_mode_entry_to_message(msg, &sme); + messaging_send_buf(smbd_messaging_context(), + sme.pid, + MSG_SMB_ASYNC_LEVEL2_BREAK, + (uint8_t *)msg, + MSG_SMB_SHARE_MODE_ENTRY_SIZE); + + /* + * We could still receive an OPLOCK_REVOKED message, so keep the + * oplock_callback_id around. + */ +} + +/** + * Callback when a break-to-level2 event is received from the kernel. + * + * Breaks from level 1 to level 2. + */ +static void oplock_break_to_level_two_handler(uint64_t id) +{ + struct onefs_callback_record *cb; + struct share_mode_entry sme; + char msg[MSG_SMB_SHARE_MODE_ENTRY_SIZE]; + + DEBUG(10, ("oplock_break_to_level_two_handler called for id %llu\n", + id)); + + if (!(cb = onefs_find_cb(id, ONEFS_OPEN_FILE))) { + DEBUG(3, ("oplock_break_to_level_two_handler: could not find " + "callback id %llu\n", id)); + return; + } + + DEBUG(10, ("oplock_break_to_level_two_handler called for file %s\n", + cb->data.fsp->fsp_name)); + + init_share_mode_entry(&sme, cb, LEVEL_II_OPLOCK); + share_mode_entry_to_message(msg, &sme); + messaging_send_buf(smbd_messaging_context(), + sme.pid, + MSG_SMB_BREAK_REQUEST, + (uint8_t *)msg, + MSG_SMB_SHARE_MODE_ENTRY_SIZE); + + /* + * We could still receive an OPLOCK_REVOKED or OPLOCK_BREAK_TO_NONE + * message, so keep the oplock_callback_id around. + */ +} + +/** + * Revoke an oplock from an unresponsive client. 
+ * + * The kernel will send this message when it times out waiting for a level 1 + * oplock break to be acknowledged by the client. The oplock is then + * immediately removed. + */ +static void oplock_revoked_handler(uint64_t id) +{ + struct onefs_callback_record *cb; + files_struct *fsp = NULL; + + DEBUG(10, ("oplock_revoked_handler called for id %llu\n", id)); + + if (!(cb = onefs_find_cb(id, ONEFS_OPEN_FILE))) { + DEBUG(3, ("oplock_revoked_handler: could not find " + "callback id %llu\n", id)); + return; + } + + fsp = cb->data.fsp; + + SMB_ASSERT(fsp->oplock_timeout == NULL); + + DEBUG(0,("Level 1 oplock break failed for file %s. Forcefully " + "revoking oplock\n", fsp->fsp_name)); + + global_client_failed_oplock_break = True; + remove_oplock(fsp); + + /* + * cb record is cleaned up in fsp ext data destructor on close, so + * leave it in the list. + */ +} + +/** + * Asynchronous ifs_createfile callback + * + * If ifs_createfile had to asynchronously break any oplocks, this function is + * called when the kernel sends an event that the open can be retried. + */ +static void semlock_available_handler(uint64_t id) +{ + struct onefs_callback_record *cb; + + DEBUG(10, ("semlock_available_handler called: %llu\n", id)); + + if (!(cb = onefs_find_cb(id, ONEFS_WAITING_FOR_OPLOCK))) { + DEBUG(5, ("semlock_available_handler: Did not find callback " + "%llu\n", id)); + return; + } + + DEBUG(10, ("Got semlock available for mid %d\n", cb->data.mid)); + + /* Paranoia check */ + if (!(open_was_deferred(cb->data.mid))) { + char *msg; + if (asprintf(&msg, "Semlock available on an open that wasn't " + "deferred: %s\n", + onefs_callback_record_str_static(cb)) != -1) { + smb_panic(msg); + } + smb_panic("Semlock available on an open that wasn't " + "deferred\n"); + } + + schedule_deferred_open_smb_message(cb->data.mid); + + /* Cleanup the callback record since the open will be retried. 
*/ + destroy_onefs_callback_record(id); + + return; +} + +/** + * Asynchronous ifs_createfile failure callback + * + * If ifs_createfile had to asynchronously break any oplocks, but an error was + * encountered in the kernel, the open will be retried with the state->failed + * set to true. This will prompt the open path to send an INTERNAL_ERROR + * error message to the client. + */ +static void semlock_async_failure_handler(uint64_t id) +{ + struct onefs_callback_record *cb; + struct pending_message_list *pml; + struct deferred_open_record *state; + + DEBUG(1, ("semlock_async_failure_handler called: %llu\n", id)); + + if (!(cb = onefs_find_cb(id, ONEFS_WAITING_FOR_OPLOCK))) { + DEBUG(5, ("semlock_async_failure_handler: Did not find callback " + "%llu\n", id)); + return; + } + + DEBUG(1, ("Got semlock_async_failure message for mid %d\n", cb->data.mid)); + + /* Paranoia check */ + if (!(open_was_deferred(cb->data.mid))) { + char *msg; + if (asprintf(&msg, "Semlock failure on an open that wasn't " + "deferred: %s\n", + onefs_callback_record_str_static(cb)) != -1) { + smb_panic(msg); + } + smb_panic("Semlock failure on an open that wasn't deferred\n"); + } + + /* Find the actual deferred open record. */ + if (!(pml = get_open_deferred_message(cb->data.mid))) { + DEBUG(0, ("Could not find deferred request for " + "mid %d\n", cb->data.mid)); + destroy_onefs_callback_record(id); + return; + } + state = (struct deferred_open_record *)pml->private_data.data; + + /* Update to failed so the client can be notified on retried open. */ + state->failed = true; + + /* Schedule deferred open for immediate retry. */ + schedule_deferred_open_smb_message(cb->data.mid); + + /* Cleanup the callback record here since the open will be retried. */ + destroy_onefs_callback_record(id); + + return; +} + +/** + * OneFS acquires all oplocks via ifs_createfile, so this is a no-op. 
+ */
+static bool onefs_set_kernel_oplock(struct kernel_oplocks *_ctx,
+				    files_struct *fsp, int oplock_type) {
+	return true;
+}
+
+/**
+ * Downgrade the kernel oplock held on fsp's fd to the given samba oplock type.
+ */
+static void onefs_release_kernel_oplock(struct kernel_oplocks *_ctx,
+					files_struct *fsp, int oplock_type)
+{
+	enum oplock_type oplock = onefs_samba_oplock_to_oplock(oplock_type);
+
+	DEBUG(10, ("onefs_release_kernel_oplock: Releasing %s to type %s\n",
+		   fsp->fsp_name, onefs_oplock_str(oplock)));
+
+	if (fsp->fh->fd == -1) {
+		DEBUG(1, ("no fd\n"));
+		return;
+	}
+
+	/* Downgrade oplock to either SHARED or NONE. */
+	if (ifs_oplock_downgrade(fsp->fh->fd, oplock)) {
+		DEBUG(1,("ifs_oplock_downgrade failed: %s\n",
+			 strerror(errno)));
+	}
+}
+
+/**
+ * Wrap ifs_semlock_write so it is only called on operations that aren't
+ * already contended in the kernel.  Failures are logged, not propagated.
+ */
+static void onefs_semlock_write(int fd, enum level2_contention_type type,
+				enum semlock_operation semlock_op)
+{
+	int ret;
+
+	switch (type) {
+	case LEVEL2_CONTEND_ALLOC_GROW:
+	case LEVEL2_CONTEND_WINDOWS_BRL:
+	case LEVEL2_CONTEND_POSIX_BRL:
+		DEBUG(10, ("Taking %d write semlock for cmd %d on fd: %d\n",
+			   semlock_op, type, fd));
+		ret = ifs_semlock_write(fd, semlock_op);
+		if (ret) {
+			DEBUG(0,("ifs_semlock_write failed taking %d write "
+				 "semlock for cmd %d on fd: %d: %s\n",
+				 semlock_op, type, fd, strerror(errno)));
+		}
+		break;
+	default: /* every other contention type: no semlock call is made */
+		DEBUG(10, ("Skipping write semlock for cmd %d on fd: %d\n",
+			   type, fd));
+	}
+}
+
+/**
+ * Contend level 2 oplocks in the kernel and smbd.
+ *
+ * Taking a write semlock will contend all level 2 oplocks in all smbds across
+ * the cluster except the fsp's own level 2 oplock.  This lack of
+ * self-contention is a limitation of the current OneFS kernel oplocks
+ * implementation.  Luckily it is easy to contend our own level 2 oplock by
+ * iterating the share mode entries and only breaking the oplock if the pid
+ * matches ours.
+ */ +static void onefs_contend_level2_oplocks_begin(files_struct *fsp, + enum level2_contention_type type) +{ + int i; + struct share_mode_lock *lck; + + /* Take care of level 2 kernel contention. */ + onefs_semlock_write(fsp->fh->fd, type, SEMLOCK_LOCK); + + /* + * If this file is level II oplocked then we need + * to grab the shared memory lock and inform all + * other files with a level II lock that they need + * to flush their read caches. We keep the lock over + * the shared memory area whilst doing this. + */ + + if (!LEVEL_II_OPLOCK_TYPE(fsp->oplock_type)) + return; + + lck = get_share_mode_lock(talloc_tos(), fsp->file_id, NULL, NULL, + NULL); + if (lck == NULL) { + DEBUG(0,("onefs_contend_level2_oplocks_begin: failed to lock " + "share mode entry for file %s.\n", fsp->fsp_name )); + return; + } + + DEBUG(10,("onefs_contend_level2_oplocks_begin: num_share_modes = %d\n", + lck->num_share_modes )); + + for(i = 0; i < lck->num_share_modes; i++) { + struct share_mode_entry *share_entry = &lck->share_modes[i]; + char msg[MSG_SMB_SHARE_MODE_ENTRY_SIZE]; + + if (!is_valid_share_mode_entry(share_entry)) { + continue; + } + + DEBUG(10,("onefs_contend_level2_oplocks_begin: " + "share_entry[%i]->op_type == %d\n", + i, share_entry->op_type )); + + if (share_entry->op_type == NO_OPLOCK) { + continue; + } + + /* Paranoia .... */ + if (EXCLUSIVE_OPLOCK_TYPE(share_entry->op_type)) { + DEBUG(0,("onefs_contend_level2_oplocks_begin: PANIC. " + "share mode entry %d is an exlusive " + "oplock !\n", i )); + TALLOC_FREE(lck); + abort(); + } + + share_mode_entry_to_message(msg, share_entry); + + /* + * Only contend our own level 2 oplock. The other processes + * will be get break events from the kernel. 
+ */ + if (procid_is_me(&share_entry->pid)) { + DATA_BLOB blob = data_blob_const(msg, + MSG_SMB_SHARE_MODE_ENTRY_SIZE); + process_oplock_async_level2_break_message( + smbd_messaging_context(), + NULL, + MSG_SMB_ASYNC_LEVEL2_BREAK, + share_entry->pid, + &blob); + } + } + + /* We let the message receivers handle removing the oplock state + in the share mode lock db. */ + + TALLOC_FREE(lck); +} + +/** + * Unlock the write semlock when the level 2 contending operation ends. + */ +static void onefs_contend_level2_oplocks_end(files_struct *fsp, + enum level2_contention_type type) +{ + /* Take care of level 2 kernel contention. */ + onefs_semlock_write(fsp->fh->fd, type, SEMLOCK_UNLOCK); +} + +/** + * Return string value of onefs oplock types. + */ +const char *onefs_oplock_str(enum oplock_type onefs_oplock_type) +{ + switch (onefs_oplock_type) { + case OPLOCK_NONE: + return "OPLOCK_NONE"; + case OPLOCK_EXCLUSIVE: + return "OPLOCK_EXCLUSIVE"; + case OPLOCK_BATCH: + return "OPLOCK_BATCH"; + case OPLOCK_SHARED: + return "OPLOCK_SHARED"; + default: + break; + } + return "UNKNOWN"; +} + +/** + * Convert from onefs to samba oplock. + */ +int onefs_oplock_to_samba_oplock(enum oplock_type onefs_oplock) +{ + switch (onefs_oplock) { + case OPLOCK_NONE: + return NO_OPLOCK; + case OPLOCK_EXCLUSIVE: + return EXCLUSIVE_OPLOCK; + case OPLOCK_BATCH: + return BATCH_OPLOCK; + case OPLOCK_SHARED: + return LEVEL_II_OPLOCK; + default: + DEBUG(0, ("unknown oplock type %d found\n", onefs_oplock)); + break; + } + return NO_OPLOCK; +} + +/** + * Convert from samba to onefs oplock. + */ +enum oplock_type onefs_samba_oplock_to_oplock(int samba_oplock_type) +{ + if (BATCH_OPLOCK_TYPE(samba_oplock_type)) return OPLOCK_BATCH; + if (EXCLUSIVE_OPLOCK_TYPE(samba_oplock_type)) return OPLOCK_EXCLUSIVE; + if (LEVEL_II_OPLOCK_TYPE(samba_oplock_type)) return OPLOCK_SHARED; + return OPLOCK_NONE; +} + +/** + * Oplock event handler. + * + * Call into the event system dispatcher to handle each event. 
+ */ +static void onefs_oplocks_read_fde_handler(struct event_context *ev, + struct fd_event *fde, + uint16_t flags, + void *private_data) +{ + struct onefs_oplocks_context *ctx = + talloc_get_type(private_data, struct onefs_oplocks_context); + + if (oplocks_event_dispatcher(ctx->onefs_ops)) { + DEBUG(0, ("oplocks_event_dispatcher failed: %s\n", + strerror(errno))); + } +} + +/** + * Setup kernel oplocks + */ +static const struct kernel_oplocks_ops onefs_koplocks_ops = { + .set_oplock = onefs_set_kernel_oplock, + .release_oplock = onefs_release_kernel_oplock, + .contend_level2_oplocks_begin = onefs_contend_level2_oplocks_begin, + .contend_level2_oplocks_end = onefs_contend_level2_oplocks_end, +}; + +static const struct oplocks_event_ops onefs_dispatch_ops = { + .oplock_break_to_none = oplock_break_to_none_handler, + .oplock_break_to_level_two = oplock_break_to_level_two_handler, + .oplock_revoked = oplock_revoked_handler, + .semlock_available = semlock_available_handler, + .semlock_async_failure = semlock_async_failure_handler, +}; + +struct kernel_oplocks *onefs_init_kernel_oplocks(TALLOC_CTX *mem_ctx) +{ + struct kernel_oplocks *_ctx = NULL; + struct onefs_oplocks_context *ctx = NULL; + struct procoptions po = PROCOPTIONS_INIT; + + DEBUG(10, ("onefs_init_kernel_oplocks called\n")); + + /* Set the non-blocking proc flag */ + po.po_flags_on |= P_NON_BLOCKING_SEMLOCK; + if (setprocoptions(&po) != 0) { + DEBUG(0, ("setprocoptions failed: %s.\n", strerror(errno))); + goto err_out; + } + + /* Setup the oplock contexts */ + _ctx = talloc_zero(mem_ctx, struct kernel_oplocks); + if (!_ctx) { + goto err_out; + } + + ctx = talloc_zero(_ctx, struct onefs_oplocks_context); + if (!ctx) { + goto err_out; + } + + _ctx->ops = &onefs_koplocks_ops; + _ctx->flags = (KOPLOCKS_LEVEL2_SUPPORTED | + KOPLOCKS_DEFERRED_OPEN_NOTIFICATION | + KOPLOCKS_TIMEOUT_NOTIFICATION | + KOPLOCKS_OPLOCK_BROKEN_NOTIFICATION); + _ctx->private_data = ctx; + ctx->ctx = _ctx; + ctx->onefs_ops = 
&onefs_dispatch_ops; + + /* Register an kernel event channel for oplocks */ + ctx->onefs_event_fd = oplocks_event_register(); + if (ctx->onefs_event_fd == -1) { + DEBUG(0, ("oplocks_event_register failed: %s\n", + strerror(errno))); + goto err_out; + } + + DEBUG(10, ("oplock event_fd = %d\n", ctx->onefs_event_fd)); + + /* Register the oplock event_fd with samba's event system */ + ctx->read_fde = event_add_fd(smbd_event_context(), + ctx, + ctx->onefs_event_fd, + EVENT_FD_READ, + onefs_oplocks_read_fde_handler, + ctx); + return _ctx; + + err_out: + talloc_free(_ctx); + talloc_free(ctx); + return NULL; +} + +#else + void oplock_onefs_dummy(void); + void oplock_onefs_dummy(void) {} +#endif /* HAVE_ONEFS */ -- cgit