zero-copy.7
 	int			(*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb);
 	void			(*sk_destruct)(struct sock *sk);
+
+	int			(* zc_alloc_data)(struct zc_buf *zb);
+	int			(* zc_commit_data)(struct zc_buf *zb);
+	wait_queue_head_t	zc_data_ready;
+	spinlock_t		zc_lock;
+	struct zc_page		*zc_pages;
+	unsigned int		zc_page_num, zc_page_index;
+	struct pagevec		zc_lru_pvec;
+	loff_t			zc_pos;
+	struct page		*zc_cached_page;
+	struct file		*zc_file;
 };
 
+void sk_zc_fini(struct sock *sk);
+void sk_zc_init(struct sock *sk);
+
 /*
  * Hashed lists helper routines
  */
diff --git a/mm/filemap.c b/mm/filemap.c
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1663,7 +1663,7 @@ EXPORT_SYMBOL(read_cache_page);
  * caller's lru-buffering pagevec.  This function is specifically for
  * generic_file_write().
  */
-static inline struct page *
+struct page *
 __grab_cache_page(struct address_space *mapping, unsigned long index,
 		struct page **cached_page, struct pagevec *lru_pvec)
 {
@@ -1692,6 +1692,8 @@ repeat:
 	return page;
 }
 
+EXPORT_SYMBOL_GPL(__grab_cache_page);
+
 /*
  * The logic we want is
  *
diff --git a/net/core/Makefile b/net/core/Makefile
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -3,7 +3,7 @@
 #
 
 obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \
-	 gen_stats.o gen_estimator.o
+	 gen_stats.o gen_estimator.o zerocopy.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
 
diff --git a/net/core/datagram.c b/net/core/datagram.c
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -213,6 +213,10 @@ int skb_copy_datagram_iovec(const struct
 {
 	int i, err, fraglen, end = 0;
 	struct sk_buff *next = skb_shinfo(skb)->frag_list;
+
+	if (skb->zerocopy)
+		return 0;
+
 next_skb:
 	fraglen = skb_headlen(skb);
 	i = -1;
@@ -364,6 +368,9 @@ int skb_copy_and_csum_datagram_iovec(con
 {
 	unsigned int csum;
 	int chunk = skb->len - hlen;
+
+	if (skb->zerocopy)
+		return 0;
 
 	/* Skip filled elements.
 	 * Pretty silly, look at memcpy_toiovec, though 8)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -70,6 +70,7 @@
 
 static kmem_cache_t *skbuff_head_cache __read_mostly;
 static kmem_cache_t *skbuff_fclone_cache __read_mostly;
+static kmem_cache_t *skbuff_head_cache_zerocopy __read_mostly;
 
 /*
  *	Keep out-of-line to prevent kernel bloat.
@@ -182,6 +183,64 @@ nodata:
 	goto out;
 }
 
+int zc_alloc_data(struct zc_buf *zb);
+int zc_commit_data(struct zc_buf *zb);
+
+struct sk_buff *__alloc_skb_zerocopy(struct zc_buf *zb, gfp_t gfp_mask)
+{
+	struct sk_buff *skb = NULL;
+	void *data;
+	int err;
+	unsigned int size = SKB_DATA_ALIGN(zb->header_size);
+
+	zb->status = -1;
+
+	if (size > ZEROCOPY_HEADER_CACHE_SIZE)
+		goto out;
+
+	skb = kmem_cache_alloc(skbuff_head_cache, gfp_mask & ~__GFP_DMA);
+	if (!skb)
+		goto out;
+
+	data = kmem_cache_alloc(skbuff_head_cache_zerocopy, gfp_mask & ~__GFP_DMA);
+	if (!data)
+		goto err_out_free_skb;
+
+	memset(skb, 0, offsetof(struct sk_buff, truesize));
+	skb->truesize = size + sizeof(struct sk_buff);
+	atomic_set(&skb->users, 1);
+	skb->head = data;
+	skb->data = data;
+	skb->tail = data;
+	skb->end  = data + size;
+
+	atomic_set(&(skb_shinfo(skb)->dataref), 1);
+	skb_shinfo(skb)->nr_frags = 0;
+	skb_shinfo(skb)->tso_size = 0;
+	skb_shinfo(skb)->tso_segs = 0;
+	skb_shinfo(skb)->frag_list = NULL;
+
+	skb->zerocopy = 1;
+	/* It could be zerocopied too, but let's use it as is for now.
+	 *   --zbr 2005_10_27 */
+	memcpy(skb->data, zb->header, zb->header_size);
+	skb_put(skb, zb->header_size);
+
+	zb->skb = skb;
+
+	err = zc_alloc_data(zb);
+	if (err)
+		goto err_out_free_skb_data;
+
+out:
+	return skb;
+err_out_free_skb_data:
+	kmem_cache_free(skbuff_head_cache_zerocopy, data);
+err_out_free_skb:
+	kmem_cache_free(skbuff_head_cache, skb);
+	skb = NULL;
+	goto out;
+}
+
 /**
  *	alloc_skb_from_cache	-	allocate a network buffer
  *	@cp: kmem_cache from which to allocate the data area
@@ -284,7 +343,10 @@ void kfree_skbmem(struct sk_buff *skb)
 	struct sk_buff *other;
 	atomic_t *fclone_ref;
 
-	skb_release_data(skb);
+	if (skb->zerocopy)
+		kmem_cache_free(skbuff_head_cache_zerocopy, skb->head);
+	else
+		skb_release_data(skb);
 	switch (skb->fclone) {
 	case SKB_FCLONE_UNAVAILABLE:
 		kmem_cache_free(skbuff_head_cache, skb);
@@ -1706,6 +1768,14 @@ void __init skb_init(void)
 					     NULL, NULL);
 	if (!skbuff_fclone_cache)
 		panic("cannot create skbuff cache");
+
+	skbuff_head_cache_zerocopy = kmem_cache_create("skbuff_head_cache_zerocopy",
+					      ZEROCOPY_HEADER_CACHE_SIZE + sizeof(struct skb_shared_info),
+					      0,
+					      SLAB_HWCACHE_ALIGN,
+					      NULL, NULL);
+	if (!skbuff_head_cache_zerocopy)
+		panic("cannot create zerocopy skbuff cache");
 }
 
 EXPORT_SYMBOL(___pskb_trim);
@@ -1739,3 +1809,4 @@ EXPORT_SYMBOL(skb_prepare_seq_read);
 EXPORT_SYMBOL(skb_seq_read);
 EXPORT_SYMBOL(skb_abort_seq_read);
 EXPORT_SYMBOL(skb_find_text);
+EXPORT_SYMBOL(__alloc_skb_zerocopy);
diff --git a/net/core/sock.c b/net/core/sock.c
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -455,6 +455,9 @@ set_rcvbuf:
 			spin_unlock_bh(&sk->sk_lock.slock);
 			ret = -ENONET;
 			break;
+		case SO_ZEROCOPY:
+			ret = sock_zc_init(sock, val);
+			break;
 
 		/* We implement the SO_SNDLOWAT etc to
 		   not be settable (1003.1g 5.3) */
@@ -660,6 +663,8 @@ struct sock *sk_alloc(int family, gfp_t
 		sock_lock_init(sk);
 	}
 
+	sk_zc_init(sk);
+
 	if (security_sk_alloc(sk, family, priority))
 		goto out_free;
 
@@ -680,6 +685,7 @@ void sk_free(struct sock *sk)
 {
 	struct sk_filter *filter;
 	struct module *owner = sk->sk_prot_creator->owner;
+	unsigned long flags;
 
 	if (sk->sk_destruct)
 		sk->sk_destruct(sk);
@@ -692,6 +698,8 @@ void sk_free(struct sock *sk)
 
 	sock_disable_timestamp(sk);
 
+	sk_zc_fini(sk);
+
 	if (atomic_read(&sk->sk_omem_alloc))
 		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
 		       __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
@@ -1320,6 +1328,8 @@ void sock_init_data(struct socket *sock,
 	sk->sk_stamp.tv_usec     = -1L;
 
 	atomic_set(&sk->sk_refcnt, 1);
+
+	sk_zc_init(sk);
 }
 
 void fastcall lock_sock(struct sock *sk)
diff --git a/net/core/zerocopy.c b/net/core/zerocopy.c
new file mode 100644
--- /dev/null
+++ b/net/core/zerocopy.c
@@ -0,0 +1,195 @@
+/*
+ * 	zerocopy.c
+ *
+ * 2005 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+
+#include <net/inet_hashtables.h>
+
+static int tcp_sendfile_alloc_data(struct zc_buf *zb);
+static int tcp_sendfile_commit_data(struct zc_buf *zb);
+
+static struct zc_handler zc_tcp_sendfile_handler = {
+	.alloc_data	= &tcp_sendfile_alloc_data,
+	.commit_data	= &tcp_sendfile_commit_data,
+};
+
+static DEFINE_SPINLOCK(zc_lock);
+static LIST_HEAD(zc_list);
+
+int zc_alloc_data(struct zc_buf *zb)
+{
+	struct zc_handler *zh;
+	int err = -ENODEV;
+
+	if (unlikely(zb->size > PAGE_SIZE))
+		return err;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(zh, &zc_list, zc_entry) {
+		err = zh->alloc_data(zb);
+		if (!err) {
+			zb->zh = zh;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return err;
+}
+
+int zc_commit_data(struct zc_buf *zb)
+{
+	int err = -EINVAL;
+
+	if (zb->zh)
+		err = zb->zh->commit_data(zb);
+
+	return err;
+}
+
+int zc_add_handler(struct zc_handler *h)
+{
+	if (!h->alloc_data || !h->commit_data)
+		return -EINVAL;
+
+	spin_lock(&zc_lock);
+	list_add_rcu(&h->zc_entry, &zc_list);
+	spin_unlock(&zc_lock);
+
+	return 0;
+}
+
+void zc_del_handler(struct zc_handler *h)
+{
+	spin_lock(&zc_lock);
+	list_del_rcu(&h->zc_entry);
+	spin_unlock(&zc_lock);
+
+	synchronize_rcu();
+}
+
+extern struct inet_hashinfo __cacheline_aligned tcp_hashinfo;
+
+static int tcp_sendfile_alloc_data(struct zc_buf *zb)
+{
+	struct ethhdr *eth;
+	struct iphdr *iph;
+	struct tcphdr *tcph;
+	struct sock *sk;
+	int dif, err = -EINVAL;
+	u32 saddr, daddr;
+	u16 sport, dport;
+
+	if (zb->header_size < sizeof(struct ethhdr) + sizeof(struct iphdr) + sizeof(struct tcphdr))
+		goto err_out_exit;
+
+	eth = zb->header;
+
+	if (eth->h_proto != htons(ETH_P_IP))
+		goto err_out_exit;
+
+	iph = (struct iphdr *)(eth + 1);
+
+	if (iph->protocol != IPPROTO_TCP)
+		goto err_out_exit;
+
+	tcph = (struct tcphdr *)(((u8 *)iph) + iph->ihl*4);
+
+	dif = 0;
+
+	saddr = iph->saddr;
+	sport = tcph->source;
+	daddr = iph->daddr;
+	dport = tcph->dest;
+
+	/*
+	 * I suspect it is not enough to disable BHs,
+	 * since it can be [and is] called from hard IRQ context.
+	 * Must do something with bound devices.
+	 */
+	local_irq_disable();
+	rcu_read_lock();
+	sk = __inet_lookup(&tcp_hashinfo, saddr, sport, daddr, ntohs(dport), dif);
+
+	if (sk) {
+		bh_lock_sock(sk);
+
+		printk("%s: sk=%p, sk->zc_alloc_data=%p, refcnt=%d.\n",
+			__func__, sk, sk->zc_alloc_data, atomic_read(&sk->sk_refcnt));
+#if 1
+		printk("%s: sk=%p, %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u, seq=%u, ack=%u, check=%04x.\n",
+			__func__, sk,
+			NIPQUAD(saddr), ntohs(sport),
+			NIPQUAD(daddr), ntohs(dport),
+			ntohl(tcph->seq), ntohl(tcph->ack_seq),
+			ntohs(tcph->check));
+#endif
+
+		spin_lock(&sk->zc_lock);
+		if (sk->zc_alloc_data && sk->zc_pages) {
+			zb->priv = sk;
+			err = sk->zc_alloc_data(zb);
+			zb->status = (err)?1:0;
+			wake_up(&sk->zc_data_ready);
+		}
+		spin_unlock(&sk->zc_lock);
+#if 1
+		printk("%s: sk=%p, %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u, seq=%u, ack=%u, check=%04x err=%d, DONE.\n",
+			__func__, sk,
+			NIPQUAD(saddr), ntohs(sport),
+			NIPQUAD(daddr), ntohs(dport),
+			ntohl(tcph->seq), ntohl(tcph->ack_seq),
+			ntohs(tcph->check), err);
+#endif
+		bh_unlock_sock(sk);
+		sock_put(sk);
+	}
+	rcu_read_unlock();
+	local_irq_enable();
+
+err_out_exit:
+	return err;
+}
+
+static int tcp_sendfile_commit_data(struct zc_buf *zb)
+{
+	struct sock *sk = zb->priv;
+	int err;
+	unsigned long flags;
+
+	spin_lock_irqsave(&sk->zc_lock, flags);
+	err = sk->zc_commit_data(zb);
+	spin_unlock_irqrestore(&sk->zc_lock, flags);
+
+	wake_up(&sk->zc_data_ready);
+
+	printk("%s: commiting data, sk=%p, size=%4u, err=%d.\n", __func__, sk, zb->size, err);
+
+	return err;
+}
+
+int __init zc_add_tcp(void)
+{
+	return zc_add_handler(&zc_tcp_sendfile_handler);
+}
+
+late_initcall(zc_add_tcp);
diff --git a/net/socket.c b/net/socket.c
--- a/net/socket.c
+++ b/net/socket.c
@@ -44,6 +44,7 @@
  *	Tigran Aivazian	:	sys_send(args) calls sys_sendto(args, NULL, 0)
  *	Tigran Aivazian	:	Made listen(2) backlog sanity checks
  *				protocol-independent
+ *	Evgeniy Polyakov:	Receiving zero-copy.
  *
  *
  *	This program is free software; you can redistribute it and/or
@@ -63,6 +64,7 @@
 #include <linux/smp_lock.h>
 #include <linux/socket.h>
 #include <linux/file.h>
+#include <linux/fs.h>
 #include <linux/net.h>
 #include <linux/interrupt.h>
 #include <linux/netdevice.h>
@@ -84,6 +86,9 @@
 #include <linux/compat.h>
 #include <linux/kmod.h>
 #include <linux/audit.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
 
 #ifdef CONFIG_NET_RADIO
 #include <linux/wireless.h>		/* Note : will define WIRELESS_EXT */
@@ -116,6 +121,7 @@ static ssize_t sock_writev(struct file *
 			   unsigned long count, loff_t *ppos);
 static ssize_t sock_sendpage(struct file *file, struct page *page,
 			     int offset, size_t size, loff_t *ppos, int more);
+ssize_t sock_sendfile(struct file *file, loff_t *ppos, size_t count, read_actor_t actor, void *target);
 
 /*
@@ -136,7 +142,8 @@ static struct file_operations socket_fil
 	.fasync =	sock_fasync,
 	.readv =	sock_readv,
 	.writev =	sock_writev,
-	.sendpage =	sock_sendpage
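
Notes (not part of the patch): sk_zc_init() and sk_zc_fini() are only declared in the sock.h hunk above; their definitions belong to another part of this series. Judging by the zero-copy fields added to struct sock, and by the fact that both sk_alloc() and sock_init_data() call sk_zc_init(), the initializer plausibly amounts to the sketch below. Every line of it is an assumption for orientation, not the author's code.

void sk_zc_init(struct sock *sk)
{
	/* Assumed reset of the zero-copy state this patch adds to struct sock. */
	sk->zc_alloc_data = NULL;
	sk->zc_commit_data = NULL;
	sk->zc_pages = NULL;
	sk->zc_page_num = 0;
	sk->zc_page_index = 0;
	sk->zc_pos = 0;
	sk->zc_cached_page = NULL;
	sk->zc_file = NULL;
	spin_lock_init(&sk->zc_lock);
	init_waitqueue_head(&sk->zc_data_ready);
	pagevec_init(&sk->zc_lru_pvec, 0);	/* 0: pages are not cold */
}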
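On the socket-owner side, the new SO_ZEROCOPY case in sock_setsockopt() routes into sock_zc_init(sock, val), which is also defined elsewhere in the series, so the meaning of the integer argument is not visible here. A userspace sketch under that caveat follows; note this SO_ZEROCOPY is the series' private option, unrelated to the transmit-side SO_ZEROCOPY mainline gained years later.

#include <stdio.h>
#include <sys/socket.h>

static int enable_rx_zerocopy(int fd)
{
	int val = 1;	/* assumption: sock_zc_init() defines the real semantics */

	if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &val, sizeof(val)) < 0) {
		perror("setsockopt(SO_ZEROCOPY)");
		return -1;
	}
	return 0;
}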
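The handler list in net/core/zerocopy.c is the extension point: zc_alloc_data() offers each incoming packet to every zc_handler on the RCU-protected zc_list until one returns 0 and claims it, and zc_commit_data() later calls back into the same handler once the data is complete. A second protocol could hook in exactly as the TCP sendfile handler does; the my_* names below are illustrative only, and struct zc_buf / struct zc_handler come from this series' headers.

#include <linux/module.h>
#include <linux/errno.h>

static int my_alloc_data(struct zc_buf *zb)
{
	/* Inspect zb->header/zb->size and return 0 to claim the packet;
	 * any error lets the next handler on zc_list have a look. */
	return -ENODEV;
}

static int my_commit_data(struct zc_buf *zb)
{
	/* Reached via zc_commit_data() once the claimed data has landed. */
	return 0;
}

static struct zc_handler my_zc_handler = {
	.alloc_data	= &my_alloc_data,
	.commit_data	= &my_commit_data,
};

static int __init my_zc_init(void)
{
	return zc_add_handler(&my_zc_handler);
}

static void __exit my_zc_exit(void)
{
	zc_del_handler(&my_zc_handler);	/* includes synchronize_rcu() */
}

module_init(my_zc_init);
module_exit(my_zc_exit);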