📄 zero-copy.11
字号:
+#define ZC_POOL_SIZE	1024
+
+extern mempool_t *idx_pool;
+
+#define ZC_MAX_IDX	4
+
+enum zc_state {
+	ZC_OK = 0,
+	ZC_GROW_UP,
+	ZC_GROW_DOWN,
+	ZC_GROW_BOTH,
+	ZC_NEXT,
+};
+
+struct zc_index
+{
+	u16	off;
+	u16	size;
+};
+
+struct zc_index_list_entry
+{
+	struct list_head	entry;
+	struct zc_index		idx;
+};
+
+struct zc_page
+{
+	struct page		*page;
+	unsigned int		page_offset;
+	unsigned int		size;
+	unsigned int		used;
+	u32			seq;
+	long			flags;
+	spinlock_t		lock;
+
+	unsigned int		idx_num;
+	struct zc_index		idx[ZC_MAX_IDX];
+	struct list_head	idx_list;
+};
+
+struct zsock
+{
+	struct list_head	zc_entry;
+	struct zc_handler	*handler;
+	atomic_t		refcnt;
+	struct sock		*sk;
+	int			(* zc_alloc_data)(struct zc_buf *zb);
+	int			(* zc_commit_data)(struct zc_buf *zb);
+	wait_queue_head_t	zc_data_ready;
+	rwlock_t		zc_lock;
+	struct zc_page		*zc_pages;
+	long			zc_flags;
+	unsigned int		zc_page_num, zc_page_index;
+	struct pagevec		zc_lru_pvec;
+	loff_t			zc_pos;
+	struct page		*zc_cached_page;
+	struct file		*zc_file;
+	u32			zc_seq_first;
+	void			*priv;
+	unsigned int		priv_size;
+};
+
+int sock_zc_setup_seq(struct zsock *zsk, u32 seq);
+void sk_zc_fini(struct zsock *zsk);
+
+int zc_setup(struct socket *sk, void *data, unsigned int size);
+void zc_cleanup(struct zsock *zsk);
+
+int zc_sock_alloc_data(struct zc_buf *zb);
+int zc_sock_commit_data(struct zc_buf *zb);
+
+int zc_alloc_data(struct zc_buf *zb);
+int zc_commit_data(struct zc_buf *zb);
+
+struct zsock *zsk_alloc(struct zc_handler *handler, void *priv, unsigned int priv_size, int (* insert)(struct zsock *zsk), gfp_t gfp_mask);
+void zsk_free(struct zsock *zsk);
+
+static inline void zc_handler_get(struct zc_handler *zc)
+{
+	atomic_inc(&zc->refcnt);
+}
+
+static inline void zc_handler_put(struct zc_handler *zc)
+{
+	if (atomic_dec_and_test(&zc->refcnt))
+		printk(KERN_DEBUG "Releasing zc=%p.\n", zc);
+}
+
+static inline void *zsk_priv(struct zsock *zsk)
+{
+	return zsk->priv;
+}
+
+static inline void zsk_get(struct zsock *zsk)
+{
+	atomic_inc(&zsk->refcnt);
+}
+
+static inline void zsk_put(struct zsock *zsk)
+{
+	if (atomic_dec_and_test(&zsk->refcnt))
+		zsk_free(zsk);
+}
+
+int tcp_udp_v4_zc_sock_insert(struct zsock *zsk);
+int tcp_udp_v4_sock_zc_init(struct socket *sock, struct tcp_udp_v4_priv *priv);
+extern struct zc_handler tcp_udp_v4_zc_handler;
+
+int commit_page(struct zc_page *zp, struct file *file, struct address_space *mapping);
+int prepare_page(struct zc_page *zp, struct zsock *zsk, struct file *file, struct address_space *mapping,
+		loff_t *ppos, loff_t count, struct pagevec *lru_pvec);
+#endif /* __KERNEL__ */
+#endif /* __ZEROCOPY_H */
diff --git a/mm/filemap.c b/mm/filemap.c
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1663,7 +1663,7 @@ EXPORT_SYMBOL(read_cache_page);
  * caller's lru-buffering pagevec.  This function is specifically for
  * generic_file_write().
  */
-static inline struct page *
+struct page *
 __grab_cache_page(struct address_space *mapping, unsigned long index,
 		struct page **cached_page, struct pagevec *lru_pvec)
 {
@@ -1692,6 +1692,8 @@ repeat:
 	return page;
 }
 
+EXPORT_SYMBOL_GPL(__grab_cache_page);
+
 /*
  * The logic we want is
 *
diff --git a/net/core/Makefile b/net/core/Makefile
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -3,7 +3,7 @@
 #
 
 obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \
-	 gen_stats.o gen_estimator.o
+	 gen_stats.o gen_estimator.o zerocopy.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
 
diff --git a/net/core/datagram.c b/net/core/datagram.c
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -214,6 +214,9 @@ int skb_copy_datagram_iovec(const struct
 	int i, err, fraglen, end = 0;
 	struct sk_buff *next = skb_shinfo(skb)->frag_list;
 
+	if (skb->zerocopy)
+		return 0;
+
 	if (!len)
 		return 0;
 
@@ -382,6 +385,9 @@ int skb_copy_and_csum_datagram_iovec(str
 {
 	unsigned int csum;
 	int chunk = skb->len - hlen;
+
+	if (skb->zerocopy)
+		return 0;
 
 	/* Skip filled elements.
 	 * Pretty silly, look at memcpy_toiovec, though 8)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -70,6 +70,7 @@
 static kmem_cache_t *skbuff_head_cache __read_mostly;
 static kmem_cache_t *skbuff_fclone_cache __read_mostly;
+static kmem_cache_t *skbuff_head_cache_zerocopy __read_mostly;
 
 /*
  *	Keep out-of-line to prevent kernel bloat.
@@ -186,6 +187,62 @@ nodata:
 	goto out;
 }
 
+struct sk_buff *__alloc_skb_zerocopy(struct zc_buf *zb, gfp_t gfp_mask)
+{
+	struct sk_buff *skb = NULL;
+	void *data;
+	int err;
+	unsigned int size = SKB_DATA_ALIGN(zb->header_size);
+
+	zb->status = -1;
+
+	if (size > ZEROCOPY_HEADER_CACHE_SIZE)
+		goto out;
+
+	skb = kmem_cache_alloc(skbuff_head_cache, gfp_mask & ~__GFP_DMA);
+	if (!skb)
+		goto out;
+
+	data = kmem_cache_alloc(skbuff_head_cache_zerocopy, gfp_mask & ~__GFP_DMA);
+	if (!data)
+		goto err_out_free_skb;
+
+	memset(skb, 0, offsetof(struct sk_buff, truesize));
+	skb->truesize = size + sizeof(struct sk_buff);
+	atomic_set(&skb->users, 1);
+	skb->head = data;
+	skb->data = data;
+	skb->tail = data;
+	skb->end = data + size;
+
+	atomic_set(&(skb_shinfo(skb)->dataref), 1);
+	skb_shinfo(skb)->nr_frags = 0;
+	skb_shinfo(skb)->tso_size = 0;
+	skb_shinfo(skb)->tso_segs = 0;
+	skb_shinfo(skb)->frag_list = NULL;
+
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+	skb->zerocopy = 1;
+	/* It could be zerocopied too, but let's use it as is for now.
+	 * --zbr 2005_10_27 */
+	memcpy(skb->data, zb->header, zb->header_size);
+	skb_put(skb, zb->header_size);
+
+	zb->skb = skb;
+
+	err = zc_alloc_data(zb);
+	if (err)
+		goto err_out_free_skb_data;
+
+out:
+	return skb;
+err_out_free_skb_data:
+	kmem_cache_free(skbuff_head_cache_zerocopy, data);
+err_out_free_skb:
+	kmem_cache_free(skbuff_head_cache, skb);
+	skb = NULL;
+	goto out;
+}
+
 /**
  *	alloc_skb_from_cache	-	allocate a network buffer
  *	@cp: kmem_cache from which to allocate the data area
@@ -288,7 +345,10 @@ void kfree_skbmem(struct sk_buff *skb)
 	struct sk_buff *other;
 	atomic_t *fclone_ref;
 
-	skb_release_data(skb);
+	if (skb->zerocopy)
+		kmem_cache_free(skbuff_head_cache_zerocopy, skb->head);
+	else
+		skb_release_data(skb);
 	switch (skb->fclone) {
 	case SKB_FCLONE_UNAVAILABLE:
 		kmem_cache_free(skbuff_head_cache, skb);
@@ -412,6 +472,7 @@ struct sk_buff *skb_clone(struct sk_buff
 	C(priority);
 	C(protocol);
 	n->destructor = NULL;
+	n->zerocopy = 0;
 #ifdef CONFIG_NETFILTER
 	C(nfmark);
 	C(nfct);
@@ -477,6 +538,7 @@ static void copy_skb_header(struct sk_bu
 	memcpy(new->cb, old->cb, sizeof(old->cb));
 	new->local_df = old->local_df;
 	new->fclone = SKB_FCLONE_UNAVAILABLE;
+	new->zerocopy = 0;
 	new->pkt_type = old->pkt_type;
 	new->tstamp = old->tstamp;
 	new->destructor = NULL;
@@ -1803,6 +1865,14 @@ void __init skb_init(void)
 					      NULL, NULL);
 	if (!skbuff_fclone_cache)
 		panic("cannot create skbuff cache");
+
+	skbuff_head_cache_zerocopy = kmem_cache_create("skbuff_head_cache_zerocopy",
+					      ZEROCOPY_HEADER_CACHE_SIZE + sizeof(struct skb_shared_info),
+					      0,
+					      SLAB_HWCACHE_ALIGN,
+					      NULL, NULL);
+	if (!skbuff_head_cache_zerocopy)
+		panic("cannot create zerocopy skbuff cache");
 }
 
 EXPORT_SYMBOL(___pskb_trim);
@@ -1837,3 +1907,4 @@ EXPORT_SYMBOL(skb_seq_read);
 EXPORT_SYMBOL(skb_abort_seq_read);
 EXPORT_SYMBOL(skb_find_text);
 EXPORT_SYMBOL(skb_append_datato_frags);
+EXPORT_SYMBOL(__alloc_skb_zerocopy);
diff --git a/net/core/sock.c b/net/core/sock.c
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -129,6 +129,8 @@
 #include <net/tcp.h>
 #endif
 
+#include <net/zerocopy.h>
+
 /* Take into consideration the size of the struct sk_buff overhead in the
  * determination of these values, since that is non-constant across
  * platforms.  This makes socket queueing behavior and performance
@@ -455,6 +457,18 @@ set_rcvbuf:
 			spin_unlock_bh(&sk->sk_lock.slock);
 			ret = -ENONET;
 			break;
+		case SO_ZEROCOPY:
+		{
+			u8 zcdata[256];
+
+			ret = -EINVAL;
+			if (optlen > sizeof(zcdata))
+				break;
+			if (copy_from_user(zcdata, optval, optlen))
+				break;
+			ret = zc_setup(sock, zcdata, optlen);
+		}
+			break;
 
 		/* We implement the SO_SNDLOWAT etc to not be settable (1003.1g 5.3) */
@@ -684,6 +698,9 @@ void sk_free(struct sock *sk)
 	if (sk->sk_destruct)
 		sk->sk_destruct(sk);
 
+	sk_zc_fini(sk->zsk);
+	zc_cleanup(sk->zsk);
+
 	filter = sk->sk_filter;
 	if (filter) {
 		sk_filter_release(sk, filter);
diff --git a/net/core/zerocopy.c b/net/core/zerocopy.c
new file mode 100644
--- /dev/null
+++ b/net/core/zerocopy.c
@@ -0,0 +1,601 @@
+/*
+ * 	zerocopy.c
+ *
+ * 2005 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+
+#include <asm/semaphore.h>
+
+#include <net/inet_hashtables.h>
+#include <net/zerocopy.h>
+
+static int tcp_udp_v4_sendfile_alloc_data(struct zc_handler *zh, struct zc_buf *zb);
+static int tcp_udp_v4_sendfile_commit_data(struct zc_handler *zh, struct zc_buf *zb);
+static int tcp_udp_v4_sendfile_setup(struct zc_handler *zh, struct socket *sock, struct sock_zc_setup_data *p);
+static int tcp_udp_v4_sendfile_cleanup(struct zsock *);
+
+#define ZC_HASH_MASK	0xf
+static struct zc_sock_bucket tcp_udp_v4_sock_bucket[ZC_HASH_MASK];
+
+struct zc_handler tcp_udp_v4_zc_handler = {
+	.alloc_data = &tcp_udp_v4_sendfile_alloc_data,
+	.commit_data = &tcp_udp_v4_sendfile_commit_data,
+	.setup = &tcp_udp_v4_sendfile_setup,
+	.cleanup = &tcp_udp_v4_sendfile_cleanup,
+	.sock_bucket = tcp_udp_v4_sock_bucket,
+	.sock_bucket_number = ZC_HASH_MASK,
+};
+
+static DECLARE_MUTEX(zc_handler_lock);
+static LIST_HEAD(zc_handler_list);
+
+static kmem_cache_t *idx_cache;
+mempool_t *idx_pool;
+
+static int zc_init(void)
+{
+	idx_cache = kmem_cache_create("zc_index", sizeof(struct zc_index_list_entry), 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!idx_cache)
+		return -ENOMEM;
+
+	idx_pool = mempool_create(ZC_POOL_SIZE, mempool_alloc_slab, mempool_free_slab, idx_cache);
+	if (!idx_pool) {
+		kmem_cache_destroy(idx_cache);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+int zc_alloc_data(struct zc_buf *zb)
+{
+	struct zc_handler *zh;
+	int err = -ENODEV;
+
+	if (unlikely(zb->size > PAGE_SIZE))
+		return err;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(zh, &zc_handler_list, zc_entry) {
+		err = zh->alloc_data(zh, zb);
+		if (!err) {
+			zb->zh = zh;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return err;
+}
+
+int zc_commit_data(struct zc_buf *zb)
+{
+	int err = -EINVAL;
+
+	if (zb->zh)
+		err = zb->zh->commit_data(zb->zh, zb);
+
+	return err;
+}
+
+void zc_cleanup(struct zsock *zsk)
+{
+	if (!zsk)
+		return;
+
+	zsk_put(zsk);
+}
+
+int zc_setup(struct socket *sock, void *data, unsigned int size)
+{
+	struct sock_zc_setup_data *p = data;
+	int found = 0;
+	struct zc_handler *zh;
+
+	if (size <= sizeof(struct sock_zc_setup_data) ||
+			size != htonl(p->size) + sizeof(struct sock_zc_setup_data)) {
+		goto err_out_exit;
+	}
+
+	down(&zc_handler_lock);
+	list_for_each_entry(zh, &zc_handler_list, zc_entry) {
+		if (!zh->setup(zh, sock, p)) {
+			found = 1;
+			break;
+		}
+	}
+	up(&zc_handler_lock);
+
+err_out_exit:
+	return (found)?0:-ENODEV;
+}
+
+int zc_add_handler(struct zc_handler *h)
+{
+	if (!h->alloc_data || !h->commit_data || !h->sock_bucket || !h->sock_bucket_number ||
+			!h->setup || !h->cleanup)
+		return -EINVAL;
+
+	synchronize_rcu();
+
+	down(&zc_handler_lock);
+	list_add_rcu(&h->zc_entry, &zc_handler_list);
+	up(&zc_handler_lock);
+
+	return 0;
+}
+
+void zc_del_handler(struct zc_handler *h)
+{
+	synchronize_rcu();
+
+	down(&zc_handler_lock);
+	list_del_rcu(&h->zc_entry);
+	up(&zc_handler_lock);
+}
+
+static inline void zc_clean_page(struct zc_page *zp)
+{
+	if (likely(zp->idx_num <= ZC_MAX_IDX)) {
+		memset(&zp->idx, 0, sizeof(zp->idx));
+	} else {
+		struct zc_index_list_entry *e, *n;
+
+		list_for_each_entry_safe(e, n, &zp->idx_list, entry) {
+			list_del(&e->entry);
+			mempool_free(e, idx_pool);
+		}
+	}
+
+	INIT_LIST_HEAD(&zp->idx_list);
+	zp->idx_num = 0;
+}
+
+extern struct page * __grab_cache_page(struct address_space *mapping, unsigned long index,
+		struct page **cached_page, struct pagevec *lru_pvec);
+
+int commit_page(struct zc_page *zp, struct file *file, struct address_space *mapping)
+{
+	int err;
+	struct address_space_operations *a_ops = mapping->a_ops;
+
+	if (down_interruptible(&mapping->host->i_sem)) {
+		err = -EBUSY;
+		goto err_out;
+	}
+	ClearPageReserved(zp->page);
+	flush_dcache_page(zp->page);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -