/*
* Unix SMB/CIFS implementation.
* Support for OneFS kernel oplocks
*
* Copyright (C) Volker Lendecke 2007
* Copyright (C) Tim Prouty, 2009
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#define DBGC_CLASS DBGC_LOCKING
#include "includes.h"
#if HAVE_ONEFS
#include "oplock_onefs.h"
#include "smbd/globals.h"
#include
#include
#include
/**
 * State for the OneFS kernel oplock backend, allocated and filled in by
 * onefs_init_kernel_oplocks().
 */
struct onefs_oplocks_context {
/* Generic kernel_oplocks object this context is attached to (its talloc parent). */
struct kernel_oplocks *ctx;
/* Table of event callbacks passed to oplocks_event_dispatcher(). */
const struct oplocks_event_ops *onefs_ops;
/* Descriptor returned by oplocks_event_register(). */
int onefs_event_fd;
/* fd event returned by event_add_fd() for onefs_event_fd. */
struct fd_event *read_fde;
};
/**
 * State of a onefs_callback_record; selects which member of the record's
 * data union is in use.
 */
enum onefs_callback_state {
/* The record refers to an open file: data.fsp is valid. */
ONEFS_OPEN_FILE,
/* The record refers to a pending request: data.mid is valid. */
ONEFS_WAITING_FOR_OPLOCK
};
/**
 * One entry in the callback_recs list, keyed by the id that
 * onefs_oplock_wait_record() generates.
 */
struct onefs_callback_record {
/* Doubly-linked list links, maintained by DLIST_ADD / DLIST_REMOVE. */
struct onefs_callback_record *prev, *next;
/* Non-zero identifier used to look the record up again. */
uint64_t id;
/* Discriminator for the data union below. */
enum onefs_callback_state state;
union {
files_struct *fsp; /* ONEFS_OPEN_FILE */
uint16_t mid; /* ONEFS_WAITING_FOR_OPLOCK */
} data;
};
/**
 * Head of the internal list of files (along with additional state) that have
 * outstanding oplocks or requests for oplocks.
 *
 * Entries are added by onefs_oplock_wait_record() and removed by
 * destroy_onefs_callback_record().
 */
struct onefs_callback_record *callback_recs;
/**
 * Convert a onefs_callback_record to a debug string.
 *
 * The string is allocated on the talloc context returned by debug_ctx().
 *
 * @param r  Record to describe; NULL is accepted and yields a fixed string.
 * @return   Description of the record. The talloc allocation result is
 *           returned unchecked.
 */
const char *onefs_cb_record_str_dbg(const struct onefs_callback_record *r)
{
	char *result;

	if (r == NULL) {
		return talloc_strdup(debug_ctx(), "NULL callback record");
	}

	switch (r->state) {
	case ONEFS_OPEN_FILE:
		result = talloc_asprintf(debug_ctx(), "cb record %llu for "
					 "file %s",
					 (unsigned long long)r->id,
					 fsp_str_dbg(r->data.fsp));
		/*
		 * Do not fall through: the next case would discard this
		 * string and read data.mid while data.fsp is the active
		 * union member.
		 */
		break;
	case ONEFS_WAITING_FOR_OPLOCK:
		result = talloc_asprintf(debug_ctx(), "cb record %llu for "
					 "pending mid %d",
					 (unsigned long long)r->id,
					 (int)r->data.mid);
		break;
	default:
		result = talloc_asprintf(debug_ctx(), "cb record %llu unknown "
					 "state %d",
					 (unsigned long long)r->id,
					 (int)r->state);
		break;
	}

	return result;
}
/**
* Traverse the list of onefs_callback_records and print all entries.
*/
static void debug_cb_records(const char *fn)
{
struct onefs_callback_record *rec;
if (DEBUGLEVEL < 10)
return;
DEBUG(10, ("cb records (%s):\n", fn));
for (rec = callback_recs; rec; rec = rec->next) {
DEBUGADD(10, ("%s\n", onefs_cb_record_str_dbg(rec)));
}
}
/**
 * Find a callback record in the list of outstanding oplock operations.
 *
 * Once ifs_createfile requests an oplock on a file, the kernel communicates
 * with samba via the oplock event channel by sending events that reference
 * an id. This function maps that id to the onefs_callback_record that was
 * created for it during the initial setup on open
 * (onefs_oplock_wait_record). When a matching id is found in the list, the
 * record's state is checked to make sure it is the one the caller expects.
 *
 * @param id              Identifier to look up.
 * @param expected_state  State the matching record must be in.
 * @return The matching record, or NULL when no record has this id. A record
 *         found in the wrong state triggers SMB_ASSERT(0).
 */
static struct onefs_callback_record *onefs_find_cb(uint64_t id,
				enum onefs_callback_state expected_state)
{
	struct onefs_callback_record *cur;

	debug_cb_records("onefs_find_cb");

	for (cur = callback_recs; cur != NULL; cur = cur->next) {
		if (cur->id != id) {
			continue;
		}

		DEBUG(10, ("found %s\n",
			   onefs_cb_record_str_dbg(cur)));

		if (cur->state == expected_state) {
			return cur;
		}

		/* Right id, wrong state: treat as a programming error. */
		DEBUG(0, ("Expected cb type %d, got %s", expected_state,
			  onefs_cb_record_str_dbg(cur)));
		SMB_ASSERT(0);
		return NULL;
	}

	DEBUG(5, ("Could not find callback record for id %llu\n", id));
	return NULL;
}
/**
* Remove and free a callback record from the callback record list.
*/
void destroy_onefs_callback_record(uint64_t id)
{
struct onefs_callback_record *rec;
debug_cb_records("destroy_onefs_callback_record");
if (id == 0) {
DEBUG(10, ("destroy_onefs_callback_record: Nothing to "
"destroy\n"));
return;
}
for (rec = callback_recs; rec; rec = rec->next) {
if (rec->id == id) {
DLIST_REMOVE(callback_recs, rec);
SAFE_FREE(rec);
DEBUG(10, ("removed cb rec %llu\n", id));
return;
}
}
DEBUG(0, ("Could not find cb rec %llu to delete", id));
SMB_ASSERT(0);
}
/**
* Initialize a callback record and add it to the list of outstanding callback
* records.
*
* This is called in the open path before ifs_createfile so an id can be
* passed in. Each callback record can be in one of two states:
*
* 1. WAITING_FOR_OPLOCK: This is the initial state for all callback
* records. If ifs_createfile can be completed syncronously without needing
* to break any level I oplocks, the state is transitioned to OPEN_FILE.
* Otherwise ifs_createfile will finish asynchronously and the open is
* deferred. When the necessary level I opocks have been broken, and the
* open can be done, an event is sent by the kernel on the oplock event
* channel, which is handled by semlock_available_handler. At this point
* the deferred open is retried. Unless a level I oplock was acquired by
* another client, ifs_createfile will now complete synchronously.
*
* 2. OPEN_FILE: Once ifs_createfile completes, the callback record is
* transitioned to this state via onefs_set_oplock_callback.
*/
uint64_t onefs_oplock_wait_record(uint16_t mid)
{
struct onefs_callback_record *result;
static uint64_t id_generator = 0;
if (!(result = SMB_MALLOC_P(struct onefs_callback_record))) {
DEBUG(0, ("talloc failed\n"));
return 0;
}
memset(result, '\0', sizeof(result));
id_generator += 1;
if (id_generator == 0) {
/* Wow, that's a long-running smbd... */
id_generator += 1;
}
result->id = id_generator;
result->state = ONEFS_WAITING_FOR_OPLOCK;
result->data.mid = mid;
DLIST_ADD(callback_recs, result);
DEBUG(10, ("New cb rec %llu created\n", result->id));
return result->id;
}
/**
 * Move a callback record from WAITING_FOR_OPLOCK to OPEN_FILE.
 *
 * Called after the file is opened and an fsp struct has been allocated. The
 * stored mid is replaced by the fsp.
 *
 * Panics when no record in the WAITING_FOR_OPLOCK state has this id, or when
 * the record's mid still belongs to a deferred open.
 *
 * @param id   Identifier returned by onefs_oplock_wait_record().
 * @param fsp  Open file to associate with the record.
 */
void onefs_set_oplock_callback(uint64_t id, files_struct *fsp)
{
	struct onefs_callback_record *rec;
	char *panic_msg;

	DEBUG(10, ("onefs_set_oplock_callback called for cb rec %llu\n", id));

	rec = onefs_find_cb(id, ONEFS_WAITING_FOR_OPLOCK);
	if (rec == NULL) {
		/* Prefer the detailed message; fall back if asprintf fails. */
		if (asprintf(&panic_msg, "Got invalid callback %lld\n",
			     id) != -1) {
			smb_panic(panic_msg);
		}
		smb_panic("Got invalid callback id\n");
	}

	/*
	 * Paranoia check: the request must no longer be queued as a
	 * deferred open when its record is upgraded.
	 */
	if (open_was_deferred(rec->data.mid)) {
		if (asprintf(&panic_msg,
			     "Trying to upgrade callback for deferred "
			     "open mid=%d\n", rec->data.mid) != -1) {
			smb_panic(panic_msg);
		}
		smb_panic("Trying to upgrade callback for deferred open "
			  "mid\n");
	}

	rec->state = ONEFS_OPEN_FILE;
	rec->data.fsp = fsp;
}
/**
* Using a callback record, initialize a share mode entry to pass to
* share_mode_entry_to_message to send samba IPC messages.
*/
static void init_share_mode_entry(struct share_mode_entry *sme,
struct onefs_callback_record *cb,
int op_type)
{
ZERO_STRUCT(*sme);
sme->pid = procid_self();
sme->op_type = op_type;
sme->id = cb->data.fsp->file_id;
sme->share_file_id = cb->data.fsp->fh->gen_id;
}
/**
* Callback when a break-to-none event is received from the kernel.
*
* On OneFS level 1 oplocks are always broken to level 2 first, therefore an
* async level 2 break message is always sent when breaking to none. The
* downside of this is that OneFS currently has no way to express breaking
* directly from level 1 to none.
*/
static void oplock_break_to_none_handler(uint64_t id)
{
struct onefs_callback_record *cb;
struct share_mode_entry sme;
char msg[MSG_SMB_SHARE_MODE_ENTRY_SIZE];
DEBUG(10, ("oplock_break_to_none_handler called for id %llu\n", id));
if (!(cb = onefs_find_cb(id, ONEFS_OPEN_FILE))) {
DEBUG(3, ("oplock_break_to_none_handler: could not find "
"callback id %llu\n", id));
return;
}
DEBUG(10, ("oplock_break_to_none_handler called for file %s\n",
fsp_str_dbg(cb->data.fsp)));
init_share_mode_entry(&sme, cb, FORCE_OPLOCK_BREAK_TO_NONE);
share_mode_entry_to_message(msg, &sme);
messaging_send_buf(smbd_messaging_context(),
sme.pid,
MSG_SMB_ASYNC_LEVEL2_BREAK,
(uint8_t *)msg,
MSG_SMB_SHARE_MODE_ENTRY_SIZE);
/*
* We could still receive an OPLOCK_REVOKED message, so keep the
* oplock_callback_id around.
*/
}
/**
* Callback when a break-to-level2 event is received from the kernel.
*
* Breaks from level 1 to level 2.
*/
static void oplock_break_to_level_two_handler(uint64_t id)
{
struct onefs_callback_record *cb;
struct share_mode_entry sme;
char msg[MSG_SMB_SHARE_MODE_ENTRY_SIZE];
DEBUG(10, ("oplock_break_to_level_two_handler called for id %llu\n",
id));
if (!(cb = onefs_find_cb(id, ONEFS_OPEN_FILE))) {
DEBUG(3, ("oplock_break_to_level_two_handler: could not find "
"callback id %llu\n", id));
return;
}
DEBUG(10, ("oplock_break_to_level_two_handler called for file %s\n",
fsp_str_dbg(cb->data.fsp)));
init_share_mode_entry(&sme, cb, LEVEL_II_OPLOCK);
share_mode_entry_to_message(msg, &sme);
messaging_send_buf(smbd_messaging_context(),
sme.pid,
MSG_SMB_BREAK_REQUEST,
(uint8_t *)msg,
MSG_SMB_SHARE_MODE_ENTRY_SIZE);
/*
* We could still receive an OPLOCK_REVOKED or OPLOCK_BREAK_TO_NONE
* message, so keep the oplock_callback_id around.
*/
}
/**
 * Revoke an oplock from an unresponsive client.
 *
 * The kernel will send this message when it times out waiting for a level 1
 * oplock break to be acknowledged by the client. The oplock is then
 * immediately removed.
 *
 * @param id  Callback record id referenced by the event.
 */
static void oplock_revoked_handler(uint64_t id)
{
	struct onefs_callback_record *rec;
	files_struct *fsp;

	DEBUG(10, ("oplock_revoked_handler called for id %llu\n", id));

	rec = onefs_find_cb(id, ONEFS_OPEN_FILE);
	if (rec == NULL) {
		DEBUG(3, ("oplock_revoked_handler: could not find "
			  "callback id %llu\n", id));
		return;
	}

	fsp = rec->data.fsp;

	SMB_ASSERT(fsp->oplock_timeout == NULL);

	DEBUG(0,("Level 1 oplock break failed for file %s. Forcefully "
		 "revoking oplock\n", fsp_str_dbg(fsp)));

	global_client_failed_oplock_break = True;
	remove_oplock(fsp);

	/*
	 * The record stays in the list; it is cleaned up in the fsp ext
	 * data destructor on close.
	 */
}
/**
 * Asynchronous ifs_createfile callback.
 *
 * If ifs_createfile had to asynchronously break any oplocks, this function
 * is called when the kernel sends an event that the open can be retried.
 *
 * @param id  Callback record id referenced by the event.
 */
static void semlock_available_handler(uint64_t id)
{
	struct onefs_callback_record *rec;

	DEBUG(10, ("semlock_available_handler called: %llu\n", id));

	rec = onefs_find_cb(id, ONEFS_WAITING_FOR_OPLOCK);
	if (rec == NULL) {
		DEBUG(5, ("semlock_available_handler: Did not find callback "
			  "%llu\n", id));
		return;
	}

	DEBUG(10, ("Got semlock available for mid %d\n", rec->data.mid));

	/* Paranoia check: the mid must belong to a deferred open. */
	if (!open_was_deferred(rec->data.mid)) {
		char *panic_msg;

		if (asprintf(&panic_msg,
			     "Semlock available on an open that wasn't "
			     "deferred: %s\n",
			     onefs_cb_record_str_dbg(rec)) != -1) {
			smb_panic(panic_msg);
		}
		smb_panic("Semlock available on an open that wasn't "
			  "deferred\n");
	}

	schedule_deferred_open_smb_message(rec->data.mid);

	/* The open will be retried, so this record is no longer needed. */
	destroy_onefs_callback_record(id);
}
/**
 * Asynchronous ifs_createfile failure callback.
 *
 * If ifs_createfile had to asynchronously break any oplocks, but an error
 * was encountered in the kernel, the open will be retried with
 * state->failed set to true. This will prompt the open path to send an
 * INTERNAL_ERROR error message to the client.
 *
 * @param id  Callback record id referenced by the event.
 */
static void semlock_async_failure_handler(uint64_t id)
{
	struct onefs_callback_record *rec;
	struct pending_message_list *pending;
	struct deferred_open_record *open_state;

	DEBUG(1, ("semlock_async_failure_handler called: %llu\n", id));

	rec = onefs_find_cb(id, ONEFS_WAITING_FOR_OPLOCK);
	if (rec == NULL) {
		DEBUG(5, ("semlock_async_failure_handler: Did not find callback "
			  "%llu\n", id));
		return;
	}

	DEBUG(1, ("Got semlock_async_failure message for mid %d\n", rec->data.mid));

	/* Paranoia check: the mid must belong to a deferred open. */
	if (!open_was_deferred(rec->data.mid)) {
		char *panic_msg;

		if (asprintf(&panic_msg,
			     "Semlock failure on an open that wasn't "
			     "deferred: %s\n",
			     onefs_cb_record_str_dbg(rec)) != -1) {
			smb_panic(panic_msg);
		}
		smb_panic("Semlock failure on an open that wasn't deferred\n");
	}

	/* Look up the queued request that carries the deferred open state. */
	pending = get_open_deferred_message(rec->data.mid);
	if (pending == NULL) {
		DEBUG(0, ("Could not find deferred request for "
			  "mid %d\n", rec->data.mid));
		destroy_onefs_callback_record(id);
		return;
	}

	/* Flag the failure so the retried open can notify the client. */
	open_state = (struct deferred_open_record *)pending->private_data.data;
	open_state->failed = true;

	/* Retry the deferred open right away. */
	schedule_deferred_open_smb_message(rec->data.mid);

	/* The open will be retried, so this record is no longer needed. */
	destroy_onefs_callback_record(id);
}
/**
 * set_oplock hook for the OneFS backend.
 *
 * OneFS acquires all oplocks via ifs_createfile, so there is nothing to do
 * here.
 *
 * @param _ctx         Unused.
 * @param fsp          Unused.
 * @param oplock_type  Unused.
 * @return             Always true.
 */
static bool onefs_set_kernel_oplock(struct kernel_oplocks *_ctx,
				    files_struct *fsp,
				    int oplock_type)
{
	/* Intentionally a no-op. */
	return true;
}
/**
 * release_oplock hook: downgrade the kernel oplock held on a file.
 *
 * @param _ctx         Unused.
 * @param fsp          File whose oplock is downgraded; nothing is done
 *                     when it has no open descriptor.
 * @param oplock_type  Samba oplock type to downgrade to; converted with
 *                     onefs_samba_oplock_to_oplock().
 */
static void onefs_release_kernel_oplock(struct kernel_oplocks *_ctx,
					files_struct *fsp, int oplock_type)
{
	enum oplock_type target;
	int fd;

	target = onefs_samba_oplock_to_oplock(oplock_type);

	DEBUG(10, ("onefs_release_kernel_oplock: Releasing %s to type %s\n",
		   fsp_str_dbg(fsp), onefs_oplock_str(target)));

	fd = fsp->fh->fd;
	if (fd == -1) {
		DEBUG(1, ("no fd\n"));
		return;
	}

	/* Downgrade oplock to either SHARED or NONE. */
	if (ifs_oplock_downgrade(fd, target) != 0) {
		DEBUG(1,("ifs_oplock_downgrade failed: %s\n",
			 strerror(errno)));
	}
}
/**
 * Wrap ifs_semlock_write so it is only called on operations that aren't
 * already contended in the kernel.
 *
 * Only LEVEL2_CONTEND_ALLOC_GROW and LEVEL2_CONTEND_POSIX_BRL reach
 * ifs_semlock_write; every other contention type is just logged.
 *
 * @param fd          Descriptor passed to ifs_semlock_write.
 * @param type        Kind of operation contending the level 2 oplock.
 * @param semlock_op  Semlock operation passed to ifs_semlock_write.
 */
static void onefs_semlock_write(int fd, enum level2_contention_type type,
				enum semlock_operation semlock_op)
{
	int rc;

	if (type != LEVEL2_CONTEND_ALLOC_GROW &&
	    type != LEVEL2_CONTEND_POSIX_BRL) {
		DEBUG(10, ("Skipping write semlock for cmd %d on fd: %d\n",
			   type, fd));
		return;
	}

	DEBUG(10, ("Taking %d write semlock for cmd %d on fd: %d\n",
		   semlock_op, type, fd));

	rc = ifs_semlock_write(fd, semlock_op);
	if (rc != 0) {
		DEBUG(0,("ifs_semlock_write failed taking %d write "
			 "semlock for cmd %d on fd: %d: %s",
			 semlock_op, type, fd, strerror(errno)));
	}
}
/**
 * Contend level 2 oplocks in the kernel and smbd.
 *
 * Taking a write semlock will contend all level 2 oplocks in all smbds
 * across the cluster except the fsp's own level 2 oplock. This lack of
 * self-contention is a limitation of the current OneFS kernel oplocks
 * implementation. Luckily it is easy to contend our own level 2 oplock by
 * checking the fsp's oplock_type. If it's a level2, send a break message to
 * the client and remove the oplock.
 *
 * @param fsp   File on which the contending operation starts.
 * @param type  Kind of contending operation.
 */
static void onefs_contend_level2_oplocks_begin(files_struct *fsp,
				enum level2_contention_type type)
{
	int fd = fsp->fh->fd;

	/* Kernel side: contend everyone else's level 2 oplocks. */
	onefs_semlock_write(fd, type, SEMLOCK_LOCK);

	/* smbd side: contend our own level 2 oplock, if we hold one. */
	if (LEVEL_II_OPLOCK_TYPE(fsp->oplock_type)) {
		break_level2_to_none_async(fsp);
	}
}
/**
 * Release the write semlock once the level 2 contending operation is over.
 *
 * @param fsp   File on which the contending operation ran.
 * @param type  Kind of contending operation; must match the value given to
 *              onefs_contend_level2_oplocks_begin so the same filter in
 *              onefs_semlock_write applies.
 */
static void onefs_contend_level2_oplocks_end(files_struct *fsp,
				enum level2_contention_type type)
{
	int fd = fsp->fh->fd;

	/* Kernel side: drop the semlock taken when the operation began. */
	onefs_semlock_write(fd, type, SEMLOCK_UNLOCK);
}
/**
 * Name a onefs oplock type for log output.
 *
 * @param onefs_oplock_type  OneFS oplock type.
 * @return  A string literal with the enumerator's name, or "UNKNOWN" for
 *          any value not listed here.
 */
const char *onefs_oplock_str(enum oplock_type onefs_oplock_type)
{
	const char *name = "UNKNOWN";

	switch (onefs_oplock_type) {
	case OPLOCK_NONE:
		name = "OPLOCK_NONE";
		break;
	case OPLOCK_EXCLUSIVE:
		name = "OPLOCK_EXCLUSIVE";
		break;
	case OPLOCK_BATCH:
		name = "OPLOCK_BATCH";
		break;
	case OPLOCK_SHARED:
		name = "OPLOCK_SHARED";
		break;
	default:
		break;
	}

	return name;
}
/**
 * Translate a onefs oplock type into the samba oplock constant.
 *
 * @param onefs_oplock  OneFS oplock type.
 * @return  The matching samba constant. Unknown values are logged at level
 *          0 and mapped to NO_OPLOCK.
 */
int onefs_oplock_to_samba_oplock(enum oplock_type onefs_oplock)
{
	int samba_oplock = NO_OPLOCK;

	switch (onefs_oplock) {
	case OPLOCK_NONE:
		samba_oplock = NO_OPLOCK;
		break;
	case OPLOCK_EXCLUSIVE:
		samba_oplock = EXCLUSIVE_OPLOCK;
		break;
	case OPLOCK_BATCH:
		samba_oplock = BATCH_OPLOCK;
		break;
	case OPLOCK_SHARED:
		samba_oplock = LEVEL_II_OPLOCK;
		break;
	default:
		DEBUG(0, ("unknown oplock type %d found\n", onefs_oplock));
		break;
	}

	return samba_oplock;
}
/**
* Convert from samba to onefs oplock.
*/
enum oplock_type onefs_samba_oplock_to_oplock(int samba_oplock_type)
{
if (BATCH_OPLOCK_TYPE(samba_oplock_type)) return OPLOCK_BATCH;
if (EXCLUSIVE_OPLOCK_TYPE(samba_oplock_type)) return OPLOCK_EXCLUSIVE;
if (LEVEL_II_OPLOCK_TYPE(samba_oplock_type)) return OPLOCK_SHARED;
return OPLOCK_NONE;
}
/**
 * Oplock event handler.
 *
 * Registered with event_add_fd() for the oplock event descriptor. Calls the
 * event system dispatcher, which invokes the callbacks in the context's
 * onefs_ops table.
 *
 * @param ev            Unused.
 * @param fde           Unused.
 * @param flags         Unused.
 * @param private_data  The struct onefs_oplocks_context given at
 *                      registration.
 */
static void onefs_oplocks_read_fde_handler(struct event_context *ev,
					   struct fd_event *fde,
					   uint16_t flags,
					   void *private_data)
{
	struct onefs_oplocks_context *oplock_ctx;
	int rc;

	oplock_ctx = talloc_get_type(private_data,
				     struct onefs_oplocks_context);

	rc = oplocks_event_dispatcher(oplock_ctx->onefs_ops);
	if (rc) {
		DEBUG(0, ("oplocks_event_dispatcher failed: %s\n",
			  strerror(errno)));
	}
}
/**
 * Setup kernel oplocks
 *
 * Operations table installed as the ops member of the kernel_oplocks object
 * by onefs_init_kernel_oplocks().
 */
static const struct kernel_oplocks_ops onefs_koplocks_ops = {
/* No-op: always returns true. */
.set_oplock = onefs_set_kernel_oplock,
/* Downgrades the oplock through ifs_oplock_downgrade. */
.release_oplock = onefs_release_kernel_oplock,
/* Take / drop the write semlock around a level 2 contending operation. */
.contend_level2_oplocks_begin = onefs_contend_level2_oplocks_begin,
.contend_level2_oplocks_end = onefs_contend_level2_oplocks_end,
};
/*
 * Event callbacks stored in the oplock context's onefs_ops member and
 * passed to oplocks_event_dispatcher() by onefs_oplocks_read_fde_handler().
 * Each handler receives the callback record id referenced by the event.
 */
static const struct oplocks_event_ops onefs_dispatch_ops = {
.oplock_break_to_none = oplock_break_to_none_handler,
.oplock_break_to_level_two = oplock_break_to_level_two_handler,
.oplock_revoked = oplock_revoked_handler,
.semlock_available = semlock_available_handler,
.semlock_async_failure = semlock_async_failure_handler,
};
/**
 * Create and register the OneFS kernel oplock backend.
 *
 * Turns on the P_NON_BLOCKING_SEMLOCK process option, allocates the
 * kernel_oplocks object and its private onefs_oplocks_context, registers an
 * oplock event channel, and hooks the resulting descriptor into samba's
 * event system.
 *
 * @param mem_ctx  talloc parent for the returned object.
 * @return  The new kernel_oplocks object, or NULL if any step fails.
 */
struct kernel_oplocks *onefs_init_kernel_oplocks(TALLOC_CTX *mem_ctx)
{
	struct kernel_oplocks *_ctx = NULL;
	struct onefs_oplocks_context *ctx = NULL;
	struct procoptions po = PROCOPTIONS_INIT;

	DEBUG(10, ("onefs_init_kernel_oplocks called\n"));

	/* Set the non-blocking proc flag */
	po.po_flags_on |= P_NON_BLOCKING_SEMLOCK;
	if (setprocoptions(&po) != 0) {
		DEBUG(0, ("setprocoptions failed: %s.\n", strerror(errno)));
		return NULL;
	}

	/* Setup the oplock contexts */
	_ctx = talloc_zero(mem_ctx, struct kernel_oplocks);
	if (!_ctx) {
		return NULL;
	}

	/* ctx is a talloc child of _ctx, so freeing _ctx releases both. */
	ctx = talloc_zero(_ctx, struct onefs_oplocks_context);
	if (!ctx) {
		goto err_out;
	}

	_ctx->ops = &onefs_koplocks_ops;
	_ctx->flags = (KOPLOCKS_LEVEL2_SUPPORTED |
		       KOPLOCKS_DEFERRED_OPEN_NOTIFICATION |
		       KOPLOCKS_TIMEOUT_NOTIFICATION |
		       KOPLOCKS_OPLOCK_BROKEN_NOTIFICATION);
	_ctx->private_data = ctx;
	ctx->ctx = _ctx;
	ctx->onefs_ops = &onefs_dispatch_ops;

	/* Register a kernel event channel for oplocks */
	ctx->onefs_event_fd = oplocks_event_register();
	if (ctx->onefs_event_fd == -1) {
		DEBUG(0, ("oplocks_event_register failed: %s\n",
			  strerror(errno)));
		goto err_out;
	}

	DEBUG(10, ("oplock event_fd = %d\n", ctx->onefs_event_fd));

	/* Register the oplock event_fd with samba's event system */
	ctx->read_fde = event_add_fd(smbd_event_context(),
				     ctx,
				     ctx->onefs_event_fd,
				     EVENT_FD_READ,
				     onefs_oplocks_read_fde_handler,
				     ctx);
	if (ctx->read_fde == NULL) {
		/*
		 * Without the fd event no oplock event would ever be
		 * dispatched, so report failure instead of returning a
		 * context that silently does nothing.
		 *
		 * NOTE(review): the event channel descriptor is not released
		 * here because no unregister call is visible in this file --
		 * confirm the proper teardown API.
		 */
		DEBUG(0, ("event_add_fd failed for oplock event_fd %d\n",
			  ctx->onefs_event_fd));
		goto err_out;
	}

	return _ctx;

 err_out:
	talloc_free(_ctx);
	return NULL;
}
#else
/*
 * Empty placeholder compiled when HAVE_ONEFS is not set.
 * NOTE(review): presumably here so this translation unit is never empty --
 * confirm.
 */
void oplock_onefs_dummy(void);
void oplock_onefs_dummy(void) {}
#endif /* HAVE_ONEFS */