diff options
author | Tim Prouty <tprouty@samba.org> | 2009-02-20 13:27:39 -0800 |
---|---|---|
committer | Tim Prouty <tprouty@samba.org> | 2009-02-21 17:10:42 -0800 |
commit | 8ec9903426ec4e559df8ac8306a8ebcdf0706176 (patch) | |
tree | d851e433a5f634e1b977b00b489d61c22a050366 /source3 | |
parent | 0dcfa9ce1baa9f2074a002fdb5c8b88cc5db28db (diff) | |
download | samba-8ec9903426ec4e559df8ac8306a8ebcdf0706176.tar.gz samba-8ec9903426ec4e559df8ac8306a8ebcdf0706176.tar.bz2 samba-8ec9903426ec4e559df8ac8306a8ebcdf0706176.zip |
s3 OneFS: Add an atomic sendfile implementation
Diffstat (limited to 'source3')
-rw-r--r-- | source3/modules/onefs.h | 10 | ||||
-rw-r--r-- | source3/modules/onefs_system.c | 257 | ||||
-rw-r--r-- | source3/modules/vfs_onefs.c | 15 |
3 files changed, 282 insertions, 0 deletions
diff --git a/source3/modules/onefs.h b/source3/modules/onefs.h index ea452a454d..a70664bbf3 100644 --- a/source3/modules/onefs.h +++ b/source3/modules/onefs.h @@ -47,6 +47,8 @@ enum onefs_acl_wire_format #define PARM_ATIME_STATIC_DEFAULT NULL #define PARM_ATIME_SLOP "atime now slop" #define PARM_ATIME_SLOP_DEFAULT 0 +#define PARM_ATOMIC_SENDFILE "atomic sendfile" +#define PARM_ATOMIC_SENDFILE_DEFAULT true #define PARM_CREATOR_OWNER_GETS_FULL_CONTROL "creator owner gets full control" #define PARM_CREATOR_OWNER_GETS_FULL_CONTROL_DEFAULT true #define PARM_CTIME_NOW "ctime now files" @@ -63,6 +65,10 @@ enum onefs_acl_wire_format #define PARM_MTIME_SLOP_DEFAULT 0 #define PARM_USE_READDIRPLUS "use readdirplus" #define PARM_USE_READDIRPLUS_DEFAULT true +#define PARM_SENDFILE_LARGE_READS "sendfile large reads" +#define PARM_SENDFILE_LARGE_READS_DEFAULT false +#define PARM_SENDFILE_SAFE "sendfile safe" +#define PARM_SENDFILE_SAFE_DEFAULT true #define PARM_SIMPLE_FILE_SHARING_COMPATIBILITY_MODE "simple file sharing compatibility mode" #define PARM_SIMPLE_FILE_SHARING_COMPATIBILITY_MODE_DEFAULT false #define PARM_UNMAPPABLE_SIDS_DENY_EVERYONE "unmappable sids deny everyone" @@ -254,6 +260,10 @@ int onefs_sys_create_file(connection_struct *conn, uint32_t ntfs_flags, int *granted_oplock); +ssize_t onefs_sys_sendfile(connection_struct *conn, int tofd, int fromfd, + const DATA_BLOB *header, SMB_OFF_T offset, + size_t count); + ssize_t onefs_sys_recvfile(int fromfd, int tofd, SMB_OFF_T offset, size_t count); diff --git a/source3/modules/onefs_system.c b/source3/modules/onefs_system.c index 3a86b4b815..10802895cd 100644 --- a/source3/modules/onefs_system.c +++ b/source3/modules/onefs_system.c @@ -178,6 +178,263 @@ int onefs_sys_create_file(connection_struct *conn, } /** + * FreeBSD based sendfile implementation that allows for atomic semantics. 
+ */ +static ssize_t onefs_sys_do_sendfile(int tofd, int fromfd, + const DATA_BLOB *header, SMB_OFF_T offset, size_t count, bool atomic) +{ + size_t total=0; + struct sf_hdtr hdr; + struct iovec hdtrl; + size_t hdr_len = 0; + int flags = 0; + + if (atomic) { + flags = SF_ATOMIC; + } + + hdr.headers = &hdtrl; + hdr.hdr_cnt = 1; + hdr.trailers = NULL; + hdr.trl_cnt = 0; + + /* Set up the header iovec. */ + if (header) { + hdtrl.iov_base = header->data; + hdtrl.iov_len = hdr_len = header->length; + } else { + hdtrl.iov_base = NULL; + hdtrl.iov_len = 0; + } + + total = count; + while (total + hdtrl.iov_len) { + SMB_OFF_T nwritten; + int ret; + + /* + * FreeBSD sendfile returns 0 on success, -1 on error. + * Remember, the tofd and fromfd are reversed..... :-). + * nwritten includes the header data sent. + */ + + do { + ret = sendfile(fromfd, tofd, offset, total, &hdr, + &nwritten, flags); + } while (ret == -1 && errno == EINTR); + + /* On error we're done. */ + if (ret == -1) { + return -1; + } + + /* + * If this was an ATOMIC sendfile, nwritten doesn't + * necessarily indicate an error. It could mean count > than + * what sendfile can handle atomically (usually 64K) or that + * there was a short read due to the file being truncated. + */ + if (nwritten == 0) { + return atomic ? 0 : -1; + } + + /* + * An atomic sendfile should never send partial data! + */ + if (atomic && nwritten != total + hdtrl.iov_len) { + DEBUG(0,("Atomic sendfile() sent partial data: " + "%llu of %d\n", nwritten, + total + hdtrl.iov_len)); + return -1; + } + + /* + * If this was a short (signal interrupted) write we may need + * to subtract it from the header data, or null out the header + * data altogether if we wrote more than hdtrl.iov_len bytes. + * We change nwritten to be the number of file bytes written. 
+ */ + + if (hdtrl.iov_base && hdtrl.iov_len) { + if (nwritten >= hdtrl.iov_len) { + nwritten -= hdtrl.iov_len; + hdtrl.iov_base = NULL; + hdtrl.iov_len = 0; + } else { + hdtrl.iov_base = + (caddr_t)hdtrl.iov_base + nwritten; + hdtrl.iov_len -= nwritten; + nwritten = 0; + } + } + total -= nwritten; + offset += nwritten; + } + return count + hdr_len; +} + +/** + * Handles the subtleties of using sendfile with CIFS. + */ +ssize_t onefs_sys_sendfile(connection_struct *conn, int tofd, int fromfd, + const DATA_BLOB *header, SMB_OFF_T offset, + size_t count) +{ + bool atomic = false; + ssize_t ret = 0; + + if (lp_parm_bool(SNUM(conn), PARM_ONEFS_TYPE, + PARM_ATOMIC_SENDFILE, + PARM_ATOMIC_SENDFILE_DEFAULT)) { + atomic = true; + } + + /* Try the sendfile */ + ret = onefs_sys_do_sendfile(tofd, fromfd, header, offset, count, + atomic); + + /* If the sendfile wasn't atomic, we're done. */ + if (!atomic) { + DEBUG(10, ("non-atomic sendfile read %ul bytes", ret)); + return ret; + } + + /* + * Atomic sendfile takes care to not write anything to the socket + * until all of the requested bytes have been read from the file. + * There are two atomic cases that need to be handled. + * + * 1. The file was truncated causing less data to be read than was + * requested. In this case, we return back to the caller to + * indicate 0 bytes were written to the socket. This should + * prompt the caller to fallback to the standard read path: read + * the data, create a header that indicates how many bytes were + * actually read, and send the header/data back to the client. + * + * This saves us from standard sendfile behavior of sending a + * header promising more data then will actually be sent. The + * only two options are to close the socket and kill the client + * connection, or write a bunch of 0s. Closing the client + * connection is bad because there could actually be multiple + * sessions multiplexed from the same client that are all dropped + * because of a truncate. 
Writing the remaining data as 0s also + * isn't good, because the client will have an incorrect version + * of the file. If the file is written back to the server, the 0s + * will be written back. Fortunately, atomic sendfile allows us + * to avoid making this choice in most cases. + * + * 2. One downside of atomic sendfile is that there is a limit on + * the number of bytes that can be sent atomically. The kernel + * has a limited amount of mbuf space that it can read file data + * into without exhausting the system's mbufs, so a buffer of + * length xfsize is used. The xfsize at the time of writing this + * is 64K. xfsize bytes are read from the file, and subsequently + * written to the socket. This makes it impossible to do the + * sendfile atomically for a byte count > xfsize. + * + * To cope with large requests, atomic sendfile returns -1 with + * errno set to E2BIG. Since Windows maxes out at 64K writes, + * this is currently only a concern with non-Windows clients. + * POSIX extensions allow the full 24-bit byte count field to be + * used in ReadAndX, and clients such as smbclient and the Linux + * cifs client can request up to 16MB reads! There are a few + * options for handling large sendfile requests. + * + * a. Fall back to the standard read path. This is unacceptable + * because it would require prohibitively large mallocs. + * + * b. Fall back to using Samba's fake_send_file which emulates + * the kernel sendfile in userspace. This still has the same + * problem of sending the header before all of the data has + * been read, so it doesn't buy us anything, and has worse + * performance than the kernel's zero-copy sendfile. + * + * c. Use the non-atomic sendfile syscall to attempt a zero-copy + * read, and hope that there isn't a short read due to + * truncation. In the case of a short read, there are two + * options: + * + * 1. Kill the client connection + * + * 2. Write zeros to the socket for the remaining bytes + * promised in the header. 
+ * + * It is safer from a data corruption perspective to kill the + * client connection, so this is our default behavior, but if + * this causes problems this can be configured to write zeros + * via smb.conf. + */ + + /* Handle case 1: short read -> truncated file. */ + if (ret == 0) { + return ret; + } + + /* Handle case 2: large read. */ + if (ret == -1 && errno == E2BIG) { + + if (!lp_parm_bool(SNUM(conn), PARM_ONEFS_TYPE, + PARM_SENDFILE_LARGE_READS, + PARM_SENDFILE_LARGE_READS_DEFAULT)) { + DEBUG(3, ("Not attempting non-atomic large sendfile: " + "%lu bytes\n", count)); + return 0; + } + + if (count < 0x10000) { + DEBUG(0, ("Count < 2^16 and E2BIG was returned! %lu", + count)); + } + + DEBUG(10, ("attempting non-atomic large sendfile: %lu bytes\n", + count)); + + /* Try a non-atomic sendfile. */ + ret = onefs_sys_do_sendfile(tofd, fromfd, header, offset, + count, false); + /* Real error: kill the client connection. */ + if (ret == -1) { + DEBUG(1, ("error on non-atomic large sendfile " + "(%lu bytes): %s\n", count, + strerror(errno))); + return ret; + } + + /* Short read: kill the client connection. */ + if (ret != count + header->length) { + DEBUG(1, ("short read on non-atomic large sendfile " + "(%lu of %lu bytes): %s\n", ret, count, + strerror(errno))); + + /* + * Returning ret here would cause us to drop into the + * codepath that calls sendfile_short_send, which + * sends the client a bunch of zeros instead. + * Returning -1 kills the connection. + */ + if (lp_parm_bool(SNUM(conn), PARM_ONEFS_TYPE, + PARM_SENDFILE_SAFE, + PARM_SENDFILE_SAFE_DEFAULT)) { + return -1; + } + + return ret; + } + + DEBUG(10, ("non-atomic large sendfile successful\n")); + } + + /* There was error in the atomic sendfile. */ + if (ret == -1) { + DEBUG(1, ("error on %s sendfile (%lu bytes): %s\n", + atomic ? "atomic" : "non-atomic", + count, strerror(errno))); + } + + return ret; +} + +/** * Only talloc the spill buffer once (reallocing when necessary). 
*/ static char *get_spill_buffer(size_t new_count) diff --git a/source3/modules/vfs_onefs.c b/source3/modules/vfs_onefs.c index f0c6a9d8bb..60c2c977a4 100644 --- a/source3/modules/vfs_onefs.c +++ b/source3/modules/vfs_onefs.c @@ -156,6 +156,19 @@ static int onefs_open(vfs_handle_struct *handle, const char *fname, return SMB_VFS_NEXT_OPEN(handle, fname, fsp, flags, mode); } +static ssize_t onefs_sendfile(vfs_handle_struct *handle, int tofd, + files_struct *fromfsp, const DATA_BLOB *header, + SMB_OFF_T offset, size_t count) +{ + ssize_t result; + + START_PROFILE_BYTES(syscall_sendfile, count); + result = onefs_sys_sendfile(handle->conn, tofd, fromfsp->fh->fd, + header, offset, count); + END_PROFILE(syscall_sendfile); + return result; +} + static ssize_t onefs_recvfile(vfs_handle_struct *handle, int fromfd, files_struct *tofsp, SMB_OFF_T offset, size_t count) @@ -340,6 +353,8 @@ static vfs_op_tuple onefs_ops[] = { SMB_VFS_LAYER_OPAQUE}, {SMB_VFS_OP(onefs_close), SMB_VFS_OP_CLOSE, SMB_VFS_LAYER_TRANSPARENT}, + {SMB_VFS_OP(onefs_sendfile), SMB_VFS_OP_SENDFILE, + SMB_VFS_LAYER_OPAQUE}, {SMB_VFS_OP(onefs_recvfile), SMB_VFS_OP_RECVFILE, SMB_VFS_LAYER_OPAQUE}, {SMB_VFS_OP(onefs_rename), SMB_VFS_OP_RENAME, |