📄 zero-copy.6
字号:
+ int (* zc_alloc_data)(struct zc_buf *zb);+ int (* zc_commit_data)(struct zc_buf *zb);+ wait_queue_head_t zc_data_ready;+ spinlock_t zc_lock;+ struct zc_page *zc_pages;+ unsigned int zc_page_num, zc_page_index;+ unsigned int zc_users; }; /*diff --git a/mm/filemap.c b/mm/filemap.c--- a/mm/filemap.c+++ b/mm/filemap.c@@ -1663,7 +1663,7 @@ EXPORT_SYMBOL(read_cache_page); * caller's lru-buffering pagevec. This function is specifically for * generic_file_write(). */-static inline struct page *+struct page * __grab_cache_page(struct address_space *mapping, unsigned long index, struct page **cached_page, struct pagevec *lru_pvec) {@@ -1692,6 +1692,8 @@ repeat: return page; } +EXPORT_SYMBOL_GPL(__grab_cache_page);+ /* * The logic we want is *diff --git a/net/core/Makefile b/net/core/Makefile--- a/net/core/Makefile+++ b/net/core/Makefile@@ -3,7 +3,7 @@ # obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \- gen_stats.o gen_estimator.o+ gen_stats.o gen_estimator.o zerocopy.o obj-$(CONFIG_SYSCTL) += sysctl_net_core.o diff --git a/net/core/datagram.c b/net/core/datagram.c--- a/net/core/datagram.c+++ b/net/core/datagram.c@@ -213,6 +213,10 @@ int skb_copy_datagram_iovec(const struct { int i, err, fraglen, end = 0; struct sk_buff *next = skb_shinfo(skb)->frag_list;++ if (skb->zerocopy)+ return 0;+ next_skb: fraglen = skb_headlen(skb); i = -1;@@ -364,6 +368,9 @@ int skb_copy_and_csum_datagram_iovec(con { unsigned int csum; int chunk = skb->len - hlen;+ + if (skb->zerocopy)+ return 0; /* Skip filled elements. * Pretty silly, look at memcpy_toiovec, though 8)diff --git a/net/core/skbuff.c b/net/core/skbuff.c--- a/net/core/skbuff.c+++ b/net/core/skbuff.c@@ -70,6 +70,7 @@ static kmem_cache_t *skbuff_head_cache __read_mostly; static kmem_cache_t *skbuff_fclone_cache __read_mostly;+static kmem_cache_t *skbuff_head_cache_zerocopy __read_mostly; /* * Keep out-of-line to prevent kernel bloat.@@ -182,6 +183,64 @@ nodata: goto out; } +int zc_alloc_data(struct zc_buf *zb);+int zc_commit_data(struct zc_buf *zb);++struct sk_buff *__alloc_skb_zerocopy(struct zc_buf *zb, gfp_t gfp_mask)+{+ struct sk_buff *skb = NULL;+ void *data;+ int err;+ unsigned int size = SKB_DATA_ALIGN(zb->header_size);++ if (size > ZEROCOPY_HEADER_CACHE_SIZE)+ goto out;+ + zb->status = -1;++ skb = kmem_cache_alloc(skbuff_head_cache, gfp_mask & ~__GFP_DMA);+ if (!skb)+ goto out;+ + data = kmem_cache_alloc(skbuff_head_cache_zerocopy, gfp_mask & ~__GFP_DMA);+ if (!data)+ goto err_out_free_skb;++ memset(skb, 0, offsetof(struct sk_buff, truesize));+ skb->truesize = size + sizeof(struct sk_buff);+ atomic_set(&skb->users, 1);+ skb->head = data;+ skb->data = data;+ skb->tail = data;+ skb->end = data + size;+ + atomic_set(&(skb_shinfo(skb)->dataref), 1);+ skb_shinfo(skb)->nr_frags = 0;+ skb_shinfo(skb)->tso_size = 0;+ skb_shinfo(skb)->tso_segs = 0;+ skb_shinfo(skb)->frag_list = NULL;++ skb->zerocopy = 1;+ /* It could be zerocopied too, but let's use it as is for now. --zbr 2005_10_27 */+ memcpy(skb->data, zb->header, zb->header_size);+ skb_put(skb, zb->header_size);++ zb->skb = skb;++ err = zc_alloc_data(zb);+ if (err)+ goto err_out_free_skb_data;+ +out:+ return skb;+err_out_free_skb_data:+ kmem_cache_free(skbuff_head_cache_zerocopy, data);+err_out_free_skb:+ kmem_cache_free(skbuff_head_cache, skb);+ skb = NULL;+ goto out;+}+ /** * alloc_skb_from_cache - allocate a network buffer * @cp: kmem_cache from which to allocate the data area@@ -284,7 +343,10 @@ void kfree_skbmem(struct sk_buff *skb) struct sk_buff *other; atomic_t *fclone_ref; - skb_release_data(skb);+ if (skb->zerocopy)+ kmem_cache_free(skbuff_head_cache_zerocopy, skb->head);+ else+ skb_release_data(skb); switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: kmem_cache_free(skbuff_head_cache, skb);@@ -1706,6 +1768,14 @@ void __init skb_init(void) NULL, NULL); if (!skbuff_fclone_cache) panic("cannot create skbuff cache");+ + skbuff_head_cache_zerocopy = kmem_cache_create("skbuff_head_cache_zerocopy",+ ZEROCOPY_HEADER_CACHE_SIZE + sizeof(struct skb_shared_info),+ 0,+ SLAB_HWCACHE_ALIGN,+ NULL, NULL);+ if (!skbuff_head_cache_zerocopy)+ panic("cannot create zerocopy skbuff cache"); } EXPORT_SYMBOL(___pskb_trim);@@ -1739,3 +1809,4 @@ EXPORT_SYMBOL(skb_prepare_seq_read); EXPORT_SYMBOL(skb_seq_read); EXPORT_SYMBOL(skb_abort_seq_read); EXPORT_SYMBOL(skb_find_text);+EXPORT_SYMBOL(__alloc_skb_zerocopy);diff --git a/net/core/sock.c b/net/core/sock.c--- a/net/core/sock.c+++ b/net/core/sock.c@@ -704,6 +704,18 @@ void sk_free(struct sock *sk) module_put(owner); } +static void zc_sk_init(struct sock *sk)+{+ spin_lock_init(&sk->zc_lock);+ init_waitqueue_head(&sk->zc_data_ready);+ sk->zc_pages = NULL;+ sk->zc_page_num = 0;+ sk->zc_page_index = 0;+ sk->zc_alloc_data = NULL;+ sk->zc_commit_data = NULL;+}++ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) { struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);@@ -737,6 +749,8 @@ struct sock *sk_clone(const struct sock sock_reset_flag(newsk, SOCK_DONE); skb_queue_head_init(&newsk->sk_error_queue); + zc_sk_init(newsk);+ filter = newsk->sk_filter; if (filter != NULL) sk_filter_charge(newsk, filter);@@ -1320,6 +1334,8 @@ void sock_init_data(struct socket *sock, sk->sk_stamp.tv_usec = -1L; atomic_set(&sk->sk_refcnt, 1);++ zc_sk_init(sk); } void fastcall lock_sock(struct sock *sk)diff --git a/net/core/zerocopy.c b/net/core/zerocopy.cnew file mode 100644--- /dev/null+++ b/net/core/zerocopy.c@@ -0,0 +1,165 @@+/*+ * zerocopy.c+ * + * 2005 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>+ * All rights reserved.+ * + * This program is free software; you can redistribute it and/or modify+ * it under the terms of the GNU General Public License as published by+ * the Free Software Foundation; either version 2 of the License, or+ * (at your option) any later version.+ *+ * This program is distributed in the hope that it will be useful,+ * but WITHOUT ANY WARRANTY; without even the implied warranty of+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the+ * GNU General Public License for more details.+ *+ * You should have received a copy of the GNU General Public License+ * along with this program; if not, write to the Free Software+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA+ */++#include <linux/spinlock.h>+#include <linux/list.h>+#include <linux/skbuff.h>++#include <net/inet_hashtables.h>++static int tcp_sendfile_alloc_data(struct zc_buf *zb);+static int tcp_sendfile_commit_data(struct zc_buf *zb);++static struct zc_handler zc_tcp_sendfile_handler = {+ .alloc_data = &tcp_sendfile_alloc_data,+ .commit_data = &tcp_sendfile_commit_data,+};++static DEFINE_SPINLOCK(zc_lock);+static LIST_HEAD(zc_list);++int zc_alloc_data(struct zc_buf *zb)+{+ struct zc_handler *zh;+ int err = -ENODEV;++ if (unlikely(zb->size > PAGE_SIZE))+ return err;+ + rcu_read_lock();+ list_for_each_entry_rcu(zh, &zc_list, zc_entry) {+ err = zh->alloc_data(zb);+ if (!err) {+ zb->zh = zh;+ break;+ }+ }+ rcu_read_unlock();++ return err;+}++int zc_commit_data(struct zc_buf *zb)+{+ int err = -EINVAL;+ + if (zb->zh)+ err = zb->zh->commit_data(zb);+ + return err;+}++int zc_add_handler(struct zc_handler *h)+{+ if (!h->alloc_data || !h->commit_data)+ return -EINVAL;++ spin_lock(&zc_lock);+ list_add_rcu(&h->zc_entry, &zc_list);+ spin_unlock(&zc_lock);++ return 0;+}++void zc_del_handler(struct zc_handler *h)+{+ spin_lock(&zc_lock);+ list_del_rcu(&h->zc_entry);+ spin_unlock(&zc_lock);++ synchronize_rcu();+}++extern struct inet_hashinfo __cacheline_aligned tcp_hashinfo;++static int tcp_sendfile_alloc_data(struct zc_buf *zb)+{+ struct ethhdr *eth;+ struct iphdr *iph;+ struct tcphdr *tcph;+ struct sock *sk;+ int dif, err = -EINVAL;+ u32 saddr, daddr;+ u16 sport, dport;+ + if (zb->header_size < sizeof(struct ethhdr) + sizeof(struct iphdr) + sizeof(struct tcphdr))+ goto err_out_exit;++ eth = zb->header;++ if (eth->h_proto != htons(ETH_P_IP))+ goto err_out_exit;+ + iph = (struct iphdr *)(eth + 1);++ if (iph->protocol != IPPROTO_TCP)+ goto err_out_exit;+ + tcph = (struct tcphdr *)(iph + 1);++ dif = 0;+ + saddr = iph->saddr;+ sport = tcph->source;+ daddr = iph->daddr;+ dport = tcph->dest;+ + /*+ * I suspect it is not enough to disable BHs,+ * since it can be [and is] called from hard IRQ context.+ * Must do something with bound devices.+ */+ sk = inet_lookup(&tcp_hashinfo, saddr, sport, daddr, dport, dif);++ if (sk && sk->zc_alloc_data) {+ zb->priv = sk;+ err = sk->zc_alloc_data(zb);+ zb->status = (err)?1:0;+ }+#if 0+ printk("%s: sk=%p, %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u, data=%p, status=%d, err=%d.\n", + __func__, sk, + NIPQUAD(saddr), ntohs(sport),+ NIPQUAD(daddr), ntohs(dport),+ zb->skb->data, zb->status, err);+#endif +err_out_exit:+ return err;+}++static int tcp_sendfile_commit_data(struct zc_buf *zb)+{+ struct sock *sk = zb->priv;+ int err;++ err = sk->zc_commit_data(zb);+ + printk("%s: commiting data, sk=%p, size=%4u, err=%d.\n", __func__, sk, zb->size, err);++ return err;+}++int __init zc_add_tcp(void)+{+ return zc_add_handler(&zc_tcp_sendfile_handler);+}++late_initcall(zc_add_tcp);diff --git a/net/socket.c b/net/socket.c--- a/net/socket.c+++ b/net/socket.c@@ -44,6 +44,7 @@ * Tigran Aivazian : sys_send(args) calls sys_sendto(args, NULL, 0) * Tigran Aivazian : Made listen(2) backlog sanity checks * protocol-independent+ * Evgeniy Polyakov: Added sock_sendfile(). * * * This program is free software; you can redistribute it and/or@@ -84,6 +85,10 @@ #include <linux/compat.h> #include <linux/kmod.h> #include <linux/audit.h>+#include <linux/pagevec.h>+#include <linux/pagemap.h>+#include <linux/swap.h>+#include <linux/writeback.h> #ifdef CONFIG_NET_RADIO #include <linux/wireless.h> /* Note : will define WIRELESS_EXT */@@ -116,6 +121,7 @@ static ssize_t sock_writev(struct file * unsigned long count, loff_t *ppos); static ssize_t sock_sendpage(struct file *file, struct page *page, int offset, size_t size, loff_t *ppos, int more);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -