summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Tim Prouty <tprouty@samba.org> 2009-02-20 13:27:39 -0800
committer Tim Prouty <tprouty@samba.org> 2009-02-21 17:10:42 -0800
commit 8ec9903426ec4e559df8ac8306a8ebcdf0706176 (patch)
tree d851e433a5f634e1b977b00b489d61c22a050366
parent 0dcfa9ce1baa9f2074a002fdb5c8b88cc5db28db (diff)
download samba-8ec9903426ec4e559df8ac8306a8ebcdf0706176.tar.gz
samba-8ec9903426ec4e559df8ac8306a8ebcdf0706176.tar.bz2
samba-8ec9903426ec4e559df8ac8306a8ebcdf0706176.zip
s3 OneFS: Add an atomic sendfile implementation
-rw-r--r-- source3/modules/onefs.h 10
-rw-r--r-- source3/modules/onefs_system.c 257
-rw-r--r-- source3/modules/vfs_onefs.c 15
3 files changed, 282 insertions, 0 deletions
diff --git a/source3/modules/onefs.h b/source3/modules/onefs.h
index ea452a454d..a70664bbf3 100644
--- a/source3/modules/onefs.h
+++ b/source3/modules/onefs.h
@@ -47,6 +47,8 @@ enum onefs_acl_wire_format
#define PARM_ATIME_STATIC_DEFAULT NULL
#define PARM_ATIME_SLOP "atime now slop"
#define PARM_ATIME_SLOP_DEFAULT 0
+/* Option read by onefs_sys_sendfile() to decide whether the first
+ * sendfile attempt is made with atomic semantics. */
+#define PARM_ATOMIC_SENDFILE "atomic sendfile"
+#define PARM_ATOMIC_SENDFILE_DEFAULT true
#define PARM_CREATOR_OWNER_GETS_FULL_CONTROL "creator owner gets full control"
#define PARM_CREATOR_OWNER_GETS_FULL_CONTROL_DEFAULT true
#define PARM_CTIME_NOW "ctime now files"
@@ -63,6 +65,10 @@ enum onefs_acl_wire_format
#define PARM_MTIME_SLOP_DEFAULT 0
#define PARM_USE_READDIRPLUS "use readdirplus"
#define PARM_USE_READDIRPLUS_DEFAULT true
+/* Consulted by onefs_sys_sendfile() when the atomic attempt fails with
+ * E2BIG: if false, 0 is returned instead of retrying non-atomically. */
+#define PARM_SENDFILE_LARGE_READS "sendfile large reads"
+#define PARM_SENDFILE_LARGE_READS_DEFAULT false
+/* Consulted by onefs_sys_sendfile() on a short non-atomic large
+ * sendfile: if true, -1 is returned rather than the short count. */
+#define PARM_SENDFILE_SAFE "sendfile safe"
+#define PARM_SENDFILE_SAFE_DEFAULT true
#define PARM_SIMPLE_FILE_SHARING_COMPATIBILITY_MODE "simple file sharing compatibility mode"
#define PARM_SIMPLE_FILE_SHARING_COMPATIBILITY_MODE_DEFAULT false
#define PARM_UNMAPPABLE_SIDS_DENY_EVERYONE "unmappable sids deny everyone"
@@ -254,6 +260,10 @@ int onefs_sys_create_file(connection_struct *conn,
uint32_t ntfs_flags,
int *granted_oplock);
+/*
+ * Sendfile entry point defined in onefs_system.c.  Sends the optional
+ * header followed by count bytes of fromfd, starting at offset, to tofd.
+ * The share behind conn supplies the sendfile-related options.
+ */
+ssize_t onefs_sys_sendfile(connection_struct *conn, int tofd, int fromfd,
+ const DATA_BLOB *header, SMB_OFF_T offset,
+ size_t count);
+
ssize_t onefs_sys_recvfile(int fromfd, int tofd, SMB_OFF_T offset,
size_t count);
diff --git a/source3/modules/onefs_system.c b/source3/modules/onefs_system.c
index 3a86b4b815..10802895cd 100644
--- a/source3/modules/onefs_system.c
+++ b/source3/modules/onefs_system.c
@@ -178,6 +178,263 @@ int onefs_sys_create_file(connection_struct *conn,
}
/**
+ * FreeBSD based sendfile implementation that allows for atomic semantics.
+ *
+ * Sends the optional header blob followed by count bytes of fromfd,
+ * starting at offset, to tofd.  The call is repeated until both the
+ * header and the file data have been consumed.
+ *
+ * @param tofd    descriptor the data is sent to
+ * @param fromfd  descriptor of the file the data is read from
+ * @param header  optional blob sent ahead of the file data; may be NULL
+ * @param offset  file offset at which reading starts
+ * @param count   number of file bytes to send
+ * @param atomic  if true, SF_ATOMIC is passed to sendfile()
+ *
+ * @return count plus the header length once everything was sent; 0 if an
+ *         atomic sendfile() reported that nothing was written; -1 if
+ *         sendfile() failed, if a non-atomic call wrote nothing, or if an
+ *         atomic call wrote only part of the request.
+ */
+static ssize_t onefs_sys_do_sendfile(int tofd, int fromfd,
+    const DATA_BLOB *header, SMB_OFF_T offset, size_t count, bool atomic)
+{
+	size_t total=0;
+	struct sf_hdtr hdr;
+	struct iovec hdtrl;
+	size_t hdr_len = 0;
+	int flags = 0;
+
+	if (atomic) {
+		flags = SF_ATOMIC;
+	}
+
+	hdr.headers = &hdtrl;
+	hdr.hdr_cnt = 1;
+	hdr.trailers = NULL;
+	hdr.trl_cnt = 0;
+
+	/* Set up the header iovec. */
+	if (header) {
+		hdtrl.iov_base = header->data;
+		hdtrl.iov_len = hdr_len = header->length;
+	} else {
+		hdtrl.iov_base = NULL;
+		hdtrl.iov_len = 0;
+	}
+
+	total = count;
+	while (total + hdtrl.iov_len) {
+		/* Start from a defined value; sendfile() fills it in. */
+		SMB_OFF_T nwritten = 0;
+		int ret;
+
+		/*
+		 * FreeBSD sendfile returns 0 on success, -1 on error.
+		 * Remember, the tofd and fromfd are reversed..... :-).
+		 * nwritten includes the header data sent.
+		 */
+
+		do {
+			ret = sendfile(fromfd, tofd, offset, total, &hdr,
+				       &nwritten, flags);
+		} while (ret == -1 && errno == EINTR);
+
+		/* On error we're done. */
+		if (ret == -1) {
+			return -1;
+		}
+
+		/*
+		 * If this was an ATOMIC sendfile, nwritten == 0 doesn't
+		 * necessarily indicate an error. It could mean count is
+		 * larger than what sendfile can handle atomically (usually
+		 * 64K) or that there was a short read due to the file being
+		 * truncated.
+		 */
+		if (nwritten == 0) {
+			return atomic ? 0 : -1;
+		}
+
+		/*
+		 * An atomic sendfile should never send partial data!
+		 */
+		if (atomic && nwritten != total + hdtrl.iov_len) {
+			/*
+			 * Cast both values so the arguments match the
+			 * conversion specifiers regardless of how wide
+			 * SMB_OFF_T and size_t are.
+			 */
+			DEBUG(0,("Atomic sendfile() sent partial data: "
+				 "%llu of %llu\n",
+				 (unsigned long long)nwritten,
+				 (unsigned long long)(total + hdtrl.iov_len)));
+			return -1;
+		}
+
+		/*
+		 * If this was a short (signal interrupted) write we may need
+		 * to subtract it from the header data, or null out the header
+		 * data altogether if we wrote more than hdtrl.iov_len bytes.
+		 * We change nwritten to be the number of file bytes written.
+		 */
+
+		if (hdtrl.iov_base && hdtrl.iov_len) {
+			if (nwritten >= hdtrl.iov_len) {
+				nwritten -= hdtrl.iov_len;
+				hdtrl.iov_base = NULL;
+				hdtrl.iov_len = 0;
+			} else {
+				hdtrl.iov_base =
+				    (caddr_t)hdtrl.iov_base + nwritten;
+				hdtrl.iov_len -= nwritten;
+				nwritten = 0;
+			}
+		}
+		total -= nwritten;
+		offset += nwritten;
+	}
+	return count + hdr_len;
+}
+
+/**
+ * Handles the subtleties of using sendfile with CIFS.
+ *
+ * Attempts the transfer with onefs_sys_do_sendfile(), atomically when the
+ * "atomic sendfile" option is enabled for the share, and falls back to a
+ * non-atomic attempt for requests the atomic path rejects with E2BIG when
+ * "sendfile large reads" is enabled.
+ *
+ * @param conn    connection whose share supplies the sendfile options
+ * @param tofd    descriptor the data is sent to
+ * @param fromfd  descriptor of the file the data is read from
+ * @param header  optional blob sent ahead of the file data; may be NULL
+ * @param offset  file offset at which reading starts
+ * @param count   number of file bytes to send
+ *
+ * @return the number of bytes sent (header included) on success; 0 when
+ *         nothing was written and the caller should fall back to the
+ *         standard read path; -1 on error.
+ */
+ssize_t onefs_sys_sendfile(connection_struct *conn, int tofd, int fromfd,
+			   const DATA_BLOB *header, SMB_OFF_T offset,
+			   size_t count)
+{
+	bool atomic = false;
+	ssize_t ret = 0;
+	/* header is optional, so never dereference it unconditionally. */
+	size_t hdr_len = (header != NULL) ? header->length : 0;
+
+	if (lp_parm_bool(SNUM(conn), PARM_ONEFS_TYPE,
+			 PARM_ATOMIC_SENDFILE,
+			 PARM_ATOMIC_SENDFILE_DEFAULT)) {
+		atomic = true;
+	}
+
+	/* Try the sendfile */
+	ret = onefs_sys_do_sendfile(tofd, fromfd, header, offset, count,
+				    atomic);
+
+	/* If the sendfile wasn't atomic, we're done. */
+	if (!atomic) {
+		DEBUG(10, ("non-atomic sendfile read %ld bytes\n",
+			   (long)ret));
+		return ret;
+	}
+
+	/*
+	 * Atomic sendfile takes care to not write anything to the socket
+	 * until all of the requested bytes have been read from the file.
+	 * There are two atomic cases that need to be handled.
+	 *
+	 * 1. The file was truncated causing less data to be read than was
+	 *    requested. In this case, we return back to the caller to
+	 *    indicate 0 bytes were written to the socket. This should
+	 *    prompt the caller to fallback to the standard read path: read
+	 *    the data, create a header that indicates how many bytes were
+	 *    actually read, and send the header/data back to the client.
+	 *
+	 *    This saves us from standard sendfile behavior of sending a
+	 *    header promising more data than will actually be sent. The
+	 *    only two options are to close the socket and kill the client
+	 *    connection, or write a bunch of 0s. Closing the client
+	 *    connection is bad because there could actually be multiple
+	 *    sessions multiplexed from the same client that are all dropped
+	 *    because of a truncate. Writing the remaining data as 0s also
+	 *    isn't good, because the client will have an incorrect version
+	 *    of the file. If the file is written back to the server, the 0s
+	 *    will be written back. Fortunately, atomic sendfile allows us
+	 *    to avoid making this choice in most cases.
+	 *
+	 * 2. One downside of atomic sendfile, is that there is a limit on
+	 *    the number of bytes that can be sent atomically. The kernel
+	 *    has a limited amount of mbuf space that it can read file data
+	 *    into without exhausting the system's mbufs, so a buffer of
+	 *    length xfsize is used. The xfsize at the time of writing this
+	 *    is 64K. xfsize bytes are read from the file, and subsequently
+	 *    written to the socket. This makes it impossible to do the
+	 *    sendfile atomically for a byte count > xfsize.
+	 *
+	 *    To cope with large requests, atomic sendfile returns -1 with
+	 *    errno set to E2BIG. Since windows maxes out at 64K writes,
+	 *    this is currently only a concern with non-windows clients.
+	 *    Posix extensions allow the full 24bit bytecount field to be
+	 *    used in ReadAndX, and clients such as smbclient and the linux
+	 *    cifs client can request up to 16MB reads! There are a few
+	 *    options for handling large sendfile requests.
+	 *
+	 *    a. Fall back to the standard read path. This is unacceptable
+	 *       because it would require prohibitively large mallocs.
+	 *
+	 *    b. Fall back to using samba's fake_send_file which emulates
+	 *       the kernel sendfile in userspace. This still has the same
+	 *       problem of sending the header before all of the data has
+	 *       been read, so it doesn't buy us anything, and has worse
+	 *       performance than the kernel's zero-copy sendfile.
+	 *
+	 *    c. Use non-atomic sendfile syscall to attempt a zero copy
+	 *       read, and hope that there isn't a short read due to
+	 *       truncation. In the case of a short read, there are two
+	 *       options:
+	 *
+	 *       1. Kill the client connection
+	 *
+	 *       2. Write zeros to the socket for the remaining bytes
+	 *          promised in the header.
+	 *
+	 *       It is safer from a data corruption perspective to kill the
+	 *       client connection, so this is our default behavior, but if
+	 *       this causes problems this can be configured to write zeros
+	 *       via smb.conf.
+	 */
+
+	/* Handle case 1: short read -> truncated file. */
+	if (ret == 0) {
+		return ret;
+	}
+
+	/* Handle case 2: large read. */
+	if (ret == -1 && errno == E2BIG) {
+
+		if (!lp_parm_bool(SNUM(conn), PARM_ONEFS_TYPE,
+				  PARM_SENDFILE_LARGE_READS,
+				  PARM_SENDFILE_LARGE_READS_DEFAULT)) {
+			DEBUG(3, ("Not attempting non-atomic large sendfile: "
+				  "%lu bytes\n", (unsigned long)count));
+			return 0;
+		}
+
+		if (count < 0x10000) {
+			DEBUG(0, ("Count < 2^16 and E2BIG was returned! %lu\n",
+				  (unsigned long)count));
+		}
+
+		DEBUG(10, ("attempting non-atomic large sendfile: %lu bytes\n",
+			   (unsigned long)count));
+
+		/* Try a non-atomic sendfile. */
+		ret = onefs_sys_do_sendfile(tofd, fromfd, header, offset,
+					    count, false);
+		/* Real error: kill the client connection. */
+		if (ret == -1) {
+			DEBUG(1, ("error on non-atomic large sendfile "
+				  "(%lu bytes): %s\n", (unsigned long)count,
+				  strerror(errno)));
+			return ret;
+		}
+
+		/* Short read: kill the client connection. */
+		if ((size_t)ret != count + hdr_len) {
+			DEBUG(1, ("short read on non-atomic large sendfile "
+				  "(%lu of %lu bytes): %s\n",
+				  (unsigned long)ret, (unsigned long)count,
+				  strerror(errno)));
+
+			/*
+			 * Returning ret here would cause us to drop into the
+			 * codepath that calls sendfile_short_send, which
+			 * sends the client a bunch of zeros instead.
+			 * Returning -1 kills the connection.
+			 */
+			if (lp_parm_bool(SNUM(conn), PARM_ONEFS_TYPE,
+					 PARM_SENDFILE_SAFE,
+					 PARM_SENDFILE_SAFE_DEFAULT)) {
+				return -1;
+			}
+
+			return ret;
+		}
+
+		DEBUG(10, ("non-atomic large sendfile successful\n"));
+	}
+
+	/* There was an error in the atomic sendfile. */
+	if (ret == -1) {
+		DEBUG(1, ("error on %s sendfile (%lu bytes): %s\n",
+			  atomic ? "atomic" : "non-atomic",
+			  (unsigned long)count, strerror(errno)));
+	}
+
+	return ret;
+}
+
+/**
* Only talloc the spill buffer once (reallocing when necessary).
*/
static char *get_spill_buffer(size_t new_count)
diff --git a/source3/modules/vfs_onefs.c b/source3/modules/vfs_onefs.c
index f0c6a9d8bb..60c2c977a4 100644
--- a/source3/modules/vfs_onefs.c
+++ b/source3/modules/vfs_onefs.c
@@ -156,6 +156,19 @@ static int onefs_open(vfs_handle_struct *handle, const char *fname,
return SMB_VFS_NEXT_OPEN(handle, fname, fsp, flags, mode);
}
+/**
+ * VFS sendfile operation for the onefs module.
+ *
+ * Forwards the request to onefs_sys_sendfile() using the connection of
+ * the VFS handle and the file descriptor of fromfsp, with the call
+ * bracketed by the syscall_sendfile profiling macros.  The result of
+ * onefs_sys_sendfile() is returned unchanged.
+ */
+static ssize_t onefs_sendfile(vfs_handle_struct *handle, int tofd,
+ files_struct *fromfsp, const DATA_BLOB *header,
+ SMB_OFF_T offset, size_t count)
+{
+ ssize_t result;
+
+ START_PROFILE_BYTES(syscall_sendfile, count);
+ result = onefs_sys_sendfile(handle->conn, tofd, fromfsp->fh->fd,
+ header, offset, count);
+ END_PROFILE(syscall_sendfile);
+ return result;
+}
+
static ssize_t onefs_recvfile(vfs_handle_struct *handle, int fromfd,
files_struct *tofsp, SMB_OFF_T offset,
size_t count)
@@ -340,6 +353,8 @@ static vfs_op_tuple onefs_ops[] = {
SMB_VFS_LAYER_OPAQUE},
{SMB_VFS_OP(onefs_close), SMB_VFS_OP_CLOSE,
SMB_VFS_LAYER_TRANSPARENT},
+ {SMB_VFS_OP(onefs_sendfile), SMB_VFS_OP_SENDFILE,
+ SMB_VFS_LAYER_OPAQUE},
{SMB_VFS_OP(onefs_recvfile), SMB_VFS_OP_RECVFILE,
SMB_VFS_LAYER_OPAQUE},
{SMB_VFS_OP(onefs_rename), SMB_VFS_OP_RENAME,