📄 zero-copy.9
字号:
+{+ __u32 type;+ __u32 size;+ __u8 data[0];+};++struct tcp_udp_v4_priv+{+ __u32 src;+ __u32 dst;+ __u16 sport;+ __u16 dport;+ int fd;+};++#ifdef __KERNEL__++#include <linux/skbuff.h>+#include <linux/pagevec.h>++struct zc_sock_bucket+{+ struct list_head list;+ rwlock_t lock;+};++struct zc_buf;+struct zsock;++struct zc_handler+{+ struct list_head zc_entry;+ int (* alloc_data)(struct zc_handler *, struct zc_buf *);+ int (* commit_data)(struct zc_handler *, struct zc_buf *);+ int (* setup)(struct zc_handler *, struct socket *, struct sock_zc_setup_data *);+ int (* cleanup)(struct zsock *);++ struct zc_sock_bucket *sock_bucket;+ unsigned int sock_bucket_number;++ atomic_t refcnt;+};++struct zc_buf+{+ struct zc_handler *zh;+ void *header;+ unsigned int header_size;+ unsigned int size;+ void *priv;+ int status;+ struct sk_buff *skb;+ int (* move_data)(struct zc_buf *zb, unsigned int sz);+ void *priv_data;+};++extern struct sk_buff *__alloc_skb_zerocopy(struct zc_buf *zb, gfp_t gfp_mask);++enum zc_page_flags {+ ZC_PAGE_READY = 0,+};++enum zc_sock_flags {+ ZSK_DATA_READY = 0,+};++struct zc_page+{+ struct page *page;+ unsigned int page_offset;+ unsigned int size;+ unsigned int used;+ u32 seq;+ long flags;+ spinlock_t lock;+};++struct zsock+{+ struct list_head zc_entry;+ struct zc_handler *handler;+ atomic_t refcnt;+ struct sock *sk;+ int (* zc_alloc_data)(struct zc_buf *zb);+ int (* zc_commit_data)(struct zc_buf *zb);+ wait_queue_head_t zc_data_ready;+ rwlock_t zc_lock;+ struct zc_page *zc_pages;+ long zc_flags;+ unsigned int zc_page_num, zc_page_index;+ struct pagevec zc_lru_pvec;+ loff_t zc_pos;+ struct page *zc_cached_page;+ struct file *zc_file;+ u32 zc_seq_first;+ void *priv;+ unsigned int priv_size;+};++int sock_zc_setup_seq(struct zsock *zsk, u32 seq);+void sk_zc_fini(struct zsock *zsk);++int zc_setup(struct socket *sk, void *data, unsigned int size);+void zc_cleanup(struct zsock *zsk);++int zc_sock_alloc_data(struct zc_buf *zb);+int zc_sock_commit_data(struct zc_buf *zb);++int zc_alloc_data(struct zc_buf *zb);+int zc_commit_data(struct zc_buf *zb);++struct zsock *zsk_alloc(struct zc_handler *handler, void *priv, unsigned int priv_size, int (* insert)(struct zsock *zsk), gfp_t gfp_mask);+void zsk_free(struct zsock *zsk);++static inline void zc_handler_get(struct zc_handler *zc)+{+ atomic_inc(&zc->refcnt);+}++static inline void zc_handler_put(struct zc_handler *zc)+{+ if (atomic_dec_and_test(&zc->refcnt))+ printk(KERN_DEBUG "Releasing zc=%p.\n", zc);+}++static inline void *zsk_priv(struct zsock *zsk)+{+ return zsk->priv;+}++static inline void zsk_get(struct zsock *zsk)+{+ atomic_inc(&zsk->refcnt);+}++static inline void zsk_put(struct zsock *zsk)+{+ if (atomic_dec_and_test(&zsk->refcnt))+ zsk_free(zsk);+}++int tcp_udp_v4_zc_sock_insert(struct zsock *zsk);+int tcp_udp_v4_sock_zc_init(struct socket *sock, struct tcp_udp_v4_priv *priv);+extern struct zc_handler tcp_udp_v4_zc_handler;++int commit_page(struct zc_page *zp, struct file *file, struct address_space *mapping);+int prepare_page(struct zc_page *zp, struct zsock *zsk, struct file *file, struct address_space *mapping, + loff_t *ppos, loff_t count, struct pagevec *lru_pvec);+#endif /* __KERNEL__ */+#endif /* __ZEROCOPY_H */diff --git a/mm/filemap.c b/mm/filemap.c--- a/mm/filemap.c+++ b/mm/filemap.c@@ -1663,7 +1663,7 @@ EXPORT_SYMBOL(read_cache_page); * caller's lru-buffering pagevec. This function is specifically for * generic_file_write(). */-static inline struct page *+struct page * __grab_cache_page(struct address_space *mapping, unsigned long index, struct page **cached_page, struct pagevec *lru_pvec) {@@ -1692,6 +1692,8 @@ repeat: return page; } +EXPORT_SYMBOL_GPL(__grab_cache_page);+ /* * The logic we want is *diff --git a/net/core/Makefile b/net/core/Makefile--- a/net/core/Makefile+++ b/net/core/Makefile@@ -3,7 +3,7 @@ # obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \- gen_stats.o gen_estimator.o+ gen_stats.o gen_estimator.o zerocopy.o obj-$(CONFIG_SYSCTL) += sysctl_net_core.o diff --git a/net/core/datagram.c b/net/core/datagram.c--- a/net/core/datagram.c+++ b/net/core/datagram.c@@ -213,6 +213,10 @@ int skb_copy_datagram_iovec(const struct { int i, err, fraglen, end = 0; struct sk_buff *next = skb_shinfo(skb)->frag_list;++ if (skb->zerocopy)+ return 0;+ next_skb: fraglen = skb_headlen(skb); i = -1;@@ -364,6 +368,9 @@ int skb_copy_and_csum_datagram_iovec(con { unsigned int csum; int chunk = skb->len - hlen;+ + if (skb->zerocopy)+ return 0; /* Skip filled elements. * Pretty silly, look at memcpy_toiovec, though 8)diff --git a/net/core/skbuff.c b/net/core/skbuff.c--- a/net/core/skbuff.c+++ b/net/core/skbuff.c@@ -70,6 +70,7 @@ static kmem_cache_t *skbuff_head_cache __read_mostly; static kmem_cache_t *skbuff_fclone_cache __read_mostly;+static kmem_cache_t *skbuff_head_cache_zerocopy __read_mostly; /* * Keep out-of-line to prevent kernel bloat.@@ -182,6 +183,62 @@ nodata: goto out; } +struct sk_buff *__alloc_skb_zerocopy(struct zc_buf *zb, gfp_t gfp_mask)+{+ struct sk_buff *skb = NULL;+ void *data;+ int err;+ unsigned int size = SKB_DATA_ALIGN(zb->header_size);+ + zb->status = -1;++ if (size > ZEROCOPY_HEADER_CACHE_SIZE)+ goto out;++ skb = kmem_cache_alloc(skbuff_head_cache, gfp_mask & ~__GFP_DMA);+ if (!skb)+ goto out;+ + data = kmem_cache_alloc(skbuff_head_cache_zerocopy, gfp_mask & ~__GFP_DMA);+ if (!data)+ goto err_out_free_skb;++ memset(skb, 0, offsetof(struct sk_buff, truesize));+ skb->truesize = size + sizeof(struct sk_buff);+ atomic_set(&skb->users, 1);+ skb->head = data;+ skb->data = data;+ skb->tail = data;+ skb->end = data + size;+ + atomic_set(&(skb_shinfo(skb)->dataref), 1);+ skb_shinfo(skb)->nr_frags = 0;+ skb_shinfo(skb)->tso_size = 0;+ skb_shinfo(skb)->tso_segs = 0;+ skb_shinfo(skb)->frag_list = NULL;+ + skb->ip_summed = CHECKSUM_UNNECESSARY;+ skb->zerocopy = 1;+ /* It could be zerocopied too, but let's use it as is for now. --zbr 2005_10_27 */+ memcpy(skb->data, zb->header, zb->header_size);+ skb_put(skb, zb->header_size);++ zb->skb = skb;++ err = zc_alloc_data(zb);+ if (err)+ goto err_out_free_skb_data;+ +out:+ return skb;+err_out_free_skb_data:+ kmem_cache_free(skbuff_head_cache_zerocopy, data);+err_out_free_skb:+ kmem_cache_free(skbuff_head_cache, skb);+ skb = NULL;+ goto out;+}+ /** * alloc_skb_from_cache - allocate a network buffer * @cp: kmem_cache from which to allocate the data area@@ -284,7 +341,10 @@ void kfree_skbmem(struct sk_buff *skb) struct sk_buff *other; atomic_t *fclone_ref; - skb_release_data(skb);+ if (skb->zerocopy)+ kmem_cache_free(skbuff_head_cache_zerocopy, skb->head);+ else+ skb_release_data(skb); switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: kmem_cache_free(skbuff_head_cache, skb);@@ -1706,6 +1766,14 @@ void __init skb_init(void) NULL, NULL); if (!skbuff_fclone_cache) panic("cannot create skbuff cache");+ + skbuff_head_cache_zerocopy = kmem_cache_create("skbuff_head_cache_zerocopy",+ ZEROCOPY_HEADER_CACHE_SIZE + sizeof(struct skb_shared_info),+ 0,+ SLAB_HWCACHE_ALIGN,+ NULL, NULL);+ if (!skbuff_head_cache_zerocopy)+ panic("cannot create zerocopy skbuff cache"); } EXPORT_SYMBOL(___pskb_trim);@@ -1739,3 +1807,4 @@ EXPORT_SYMBOL(skb_prepare_seq_read); EXPORT_SYMBOL(skb_seq_read); EXPORT_SYMBOL(skb_abort_seq_read); EXPORT_SYMBOL(skb_find_text);+EXPORT_SYMBOL(__alloc_skb_zerocopy);diff --git a/net/core/sock.c b/net/core/sock.c--- a/net/core/sock.c+++ b/net/core/sock.c@@ -129,6 +129,8 @@ #include <net/tcp.h> #endif +#include <net/zerocopy.h>+ /* Take into consideration the size of the struct sk_buff overhead in the * determination of these values, since that is non-constant across * platforms. This makes socket queueing behavior and performance@@ -455,6 +457,18 @@ set_rcvbuf: spin_unlock_bh(&sk->sk_lock.slock); ret = -ENONET; break;+ case SO_ZEROCOPY:+ {+ u8 zcdata[256];++ ret = -EINVAL;+ if (optlen > sizeof(zcdata))+ break;+ if (copy_from_user(zcdata, optval, optlen))+ break;+ ret = zc_setup(sock, zcdata, optlen);+ }+ break; /* We implement the SO_SNDLOWAT etc to not be settable (1003.1g 5.3) */@@ -684,6 +698,9 @@ void sk_free(struct sock *sk) if (sk->sk_destruct) sk->sk_destruct(sk); + sk_zc_fini(sk->zsk);+ zc_cleanup(sk->zsk);+ filter = sk->sk_filter; if (filter) { sk_filter_release(sk, filter);diff --git a/net/core/zerocopy.c b/net/core/zerocopy.cnew file mode 100644--- /dev/null+++ b/net/core/zerocopy.c@@ -0,0 +1,530 @@+/*+ * zerocopy.c+ * + * 2005 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>+ * All rights reserved.+ * + * This program is free software; you can redistribute it and/or modify+ * it under the terms of the GNU General Public License as published by+ * the Free Software Foundation; either version 2 of the License, or+ * (at your option) any later version.+ *+ * This program is distributed in the hope that it will be useful,+ * but WITHOUT ANY WARRANTY; without even the implied warranty of+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the+ * GNU General Public License for more details.+ *+ * You should have received a copy of the GNU General Public License+ * along with this program; if not, write to the Free Software+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA+ */++#include <linux/config.h>+#include <linux/mm.h>+#include <linux/spinlock.h>+#include <linux/list.h>+#include <linux/skbuff.h>+#include <linux/pagemap.h>+#include <linux/swap.h>+#include <linux/writeback.h>+#include <linux/ip.h>+#include <linux/tcp.h>+#include <linux/tcp.h>+#include <linux/udp.h>+#include <linux/fs.h>+#include <linux/file.h>++#include <asm/semaphore.h>++#include <net/inet_hashtables.h>+#include <net/zerocopy.h>++static int tcp_udp_v4_sendfile_alloc_data(struct zc_handler *zh, struct zc_buf *zb);+static int tcp_udp_v4_sendfile_commit_data(struct zc_handler *zh, struct zc_buf *zb);+static int tcp_udp_v4_sendfile_setup(struct zc_handler *zh, struct socket *sock, struct sock_zc_setup_data *p);+static int tcp_udp_v4_sendfile_cleanup(struct zsock *);++#define ZC_HASH_MASK 0xf+static struct zc_sock_bucket tcp_udp_v4_sock_bucket[ZC_HASH_MASK];++struct zc_handler tcp_udp_v4_zc_handler = {+ .alloc_data = &tcp_udp_v4_sendfile_alloc_data,+ .commit_data = &tcp_udp_v4_sendfile_commit_data,+ .setup = &tcp_udp_v4_sendfile_setup,+ .cleanup = &tcp_udp_v4_sendfile_cleanup,+ .sock_bucket = tcp_udp_v4_sock_bucket,+ .sock_bucket_number = ZC_HASH_MASK,+};++static DECLARE_MUTEX(zc_handler_lock);+static LIST_HEAD(zc_handler_list);++int zc_alloc_data(struct zc_buf *zb)+{+ struct zc_handler *zh;+ int err = -ENODEV;++ if (unlikely(zb->size > PAGE_SIZE))+ return err;++ rcu_read_lock();+ list_for_each_entry_rcu(zh, &zc_handler_list, zc_entry) {+ err = zh->alloc_data(zh, zb);+ if (!err) {+ zb->zh = zh;+ break;+ }+ }+ rcu_read_unlock();++ return err;+}++int zc_commit_data(struct zc_buf *zb)+{+ int err = -EINVAL;+ + if (zb->zh)+ err = zb->zh->commit_data(zb->zh, zb);+ + return err;+}++void zc_cleanup(struct zsock *zsk)+{+ if (!zsk)+ return;++ zsk_put(zsk);+}++int zc_setup(struct socket *sock, void *data, unsigned int size)+{+ struct sock_zc_setup_data *p = data;+ int found = 0;+ struct zc_handler *zh;++ if (size <= sizeof(struct sock_zc_setup_data) || + size != htonl(p->size) + sizeof(struct sock_zc_setup_data)) {+ goto err_out_exit;+ }+
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -