📄 zero-copy.5
字号:
diff --git a/drivers/net/8139too.c b/drivers/net/8139too.c--- a/drivers/net/8139too.c+++ b/drivers/net/8139too.c@@ -108,6 +108,10 @@ #include <linux/mii.h> #include <linux/completion.h> #include <linux/crc32.h>+#include <linux/if_ether.h>+#include <linux/ip.h>+#include <linux/tcp.h>+#include <linux/workqueue.h> #include <asm/io.h> #include <asm/uaccess.h> #include <asm/irq.h>@@ -1895,16 +1899,23 @@ static void rtl8139_rx_err (u32 rx_statu } #if RX_BUF_IDX == 3-static __inline__ void wrap_copy(struct sk_buff *skb, const unsigned char *ring,+static __inline__ void __wrap_copy(void *data, const unsigned char *ring, u32 offset, unsigned int size) { u32 left = RX_BUF_LEN - offset; if (size > left) {- memcpy(skb->data, ring + offset, left);- memcpy(skb->data+left, ring, size - left);+ memcpy(data, ring + offset, left);+ memcpy(data+left, ring, size - left); } else- memcpy(skb->data, ring + offset, size);+ memcpy(data, ring + offset, size);++}++static __inline__ void wrap_copy(struct sk_buff *skb, const unsigned char *ring,+ u32 offset, unsigned int size)+{+ __wrap_copy(skb->data, ring, offset, size); } #endif @@ -1926,6 +1937,46 @@ static void rtl8139_isr_ack(struct rtl81 } } +static int rtl8139too_move_data(struct zc_buf *zb, unsigned int sz)+{+ struct rtl8139_private *tp = zb->priv_data;+ unsigned char *rx_ring = tp->rx_ring;+ unsigned int cur_rx = tp->cur_rx;+ u32 ring_offset = cur_rx % RX_BUF_LEN;+ int hsize = sizeof(struct ethhdr) + sizeof(struct iphdr) + sizeof(struct tcphdr) + 2;+ struct sk_buff *skb = zb->skb;+ skb_frag_t *frag;+ void *dest;+ + if (skb_shinfo(skb)->nr_frags == 0 || skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS) {+ printk(KERN_ERR "%s: broken skb: zb=%p, nr_frags=%u.\n", __func__, zb, skb_shinfo(skb)->nr_frags);+ return -EINVAL;+ }+ + frag = &skb_shinfo(skb)->frags[skb_shinfo(skb)->nr_frags-1];+ dest = page_address(frag->page) + frag->page_offset;++ printk("%s: zb=%p, page=%p, offset=%4u, size=%4u, dest=%p, sz=%4u.\n", + __func__, zb, frag->page, frag->page_offset, frag->size, dest, sz);++#if RX_BUF_IDX == 3+ wrap_copy(dest, rx_ring, ring_offset + 4 + hsize, sz);+#else+ memcpy(dest, &rx_ring[ring_offset + 4 + hsize], sz);+#endif++ return sz;+}++static void rtl8139_work_func(void *data)+{+ struct sk_buff *skb = data;+ + netif_receive_skb(skb);+}++static DECLARE_WORK(rtl8139_work, &rtl8139_work_func, NULL);+ static int rtl8139_rx(struct net_device *dev, struct rtl8139_private *tp, int budget) {@@ -1956,8 +2007,7 @@ static int rtl8139_rx(struct net_device if (netif_msg_rx_status(tp)) printk(KERN_DEBUG "%s: rtl8139_rx() status %4.4x, size %4.4x,"- " cur %4.4x.\n", dev->name, rx_status,- rx_size, cur_rx);+ " cur %4.4x.\n", dev->name, rx_status, rx_size, cur_rx); #if RTL8139_DEBUG > 2 { int i;@@ -2005,34 +2055,74 @@ no_early_rx: goto out; } - /* Malloc up new buffer, compatible with net-2e. */- /* Omit the four octet CRC from the length. */+ {+ u8 zc_data[sizeof(struct ethhdr) + sizeof(struct iphdr) + sizeof(struct tcphdr) + 2 + sizeof(struct zc_buf)];+ int hsize = sizeof(struct ethhdr) + sizeof(struct iphdr) + sizeof(struct tcphdr) + 2;+ struct zc_buf *zb;++ memset(&zc_data, 0, sizeof(zc_data));+ zb = (struct zc_buf *)zc_data;++ zb->header = (void *)(zb + 1);+ zb->header_size = hsize - 2;+ zb->size = pkt_size + 2 - hsize;+ zb->priv_data = tp;+ zb->move_data = &rtl8139too_move_data; - skb = dev_alloc_skb (pkt_size + 2);- if (likely(skb)) {- skb->dev = dev;- skb_reserve (skb, 2); /* 16 byte align the IP fields. */ #if RX_BUF_IDX == 3- wrap_copy(skb, rx_ring, ring_offset+4, pkt_size);+ __wrap_copy(zb->header, rx_ring, ring_offset+4, hsize); #else- eth_copy_and_sum (skb, &rx_ring[ring_offset + 4], pkt_size, 0);+ memcpy(zb->header, &rx_ring[ring_offset + 4], hsize); #endif- skb_put (skb, pkt_size);-- skb->protocol = eth_type_trans (skb, dev);+ skb = alloc_skb_zerocopy(zb, GFP_ATOMIC);+ if (skb) {+ skb->dev = dev;+ skb->protocol = eth_type_trans(skb, dev);+ + dev->last_rx = jiffies;+ tp->stats.rx_bytes += pkt_size;+ tp->stats.rx_packets++;+ netif_receive_skb(skb);+ //rtl8139_work.data = skb;+ //schedule_work(&rtl8139_work);+ } else if (zb->status == -1) {+ /* Malloc up new buffer, compatible with net-2e. */+ /* Omit the four octet CRC from the length. */+ + skb = dev_alloc_skb (pkt_size + 2);+ if (likely(skb)) {+ skb->dev = dev;+ skb_reserve (skb, 2); /* 16 byte align the IP fields. */+#if RX_BUF_IDX == 3+ wrap_copy(skb, rx_ring, ring_offset+4, pkt_size);+#else+ eth_copy_and_sum (skb, &rx_ring[ring_offset + 4], pkt_size, 0);+#endif+ skb_put (skb, pkt_size); - dev->last_rx = jiffies;- tp->stats.rx_bytes += pkt_size;- tp->stats.rx_packets++;+ skb->protocol = eth_type_trans (skb, dev); - netif_receive_skb (skb);- } else {- if (net_ratelimit()) - printk (KERN_WARNING- "%s: Memory squeeze, dropping packet.\n",- dev->name);- tp->stats.rx_dropped++;+ dev->last_rx = jiffies;+ tp->stats.rx_bytes += pkt_size;+ tp->stats.rx_packets++;++ netif_receive_skb(skb);+ } else {+ if (net_ratelimit()) + printk (KERN_WARNING+ "%s: Memory squeeze, dropping packet.\n",+ dev->name);+ tp->stats.rx_dropped++;+ }+ } else {+ if (net_ratelimit()) + printk (KERN_WARNING+ "%s: Zero-copy failed, dropping packet.\n",+ dev->name);+ tp->stats.rx_dropped++;+ } }+ received++; cur_rx = (cur_rx + rx_size + 4 + 3) & ~3;diff --git a/fs/read_write.c b/fs/read_write.c--- a/fs/read_write.c+++ b/fs/read_write.c@@ -15,6 +15,8 @@ #include <linux/module.h> #include <linux/syscalls.h> +#include <net/sock.h>+ #include <asm/uaccess.h> #include <asm/unistd.h> @@ -670,8 +672,15 @@ static ssize_t do_sendfile(int out_fd, i if (!(out_file->f_mode & FMODE_WRITE)) goto fput_out; retval = -EINVAL;- if (!out_file->f_op || !out_file->f_op->sendpage)+ if (!out_file->f_op)+ goto fput_out;+ + if (!SOCKET_I(in_file->f_dentry->d_inode) && !out_file->f_op->sendpage) {+ printk("%s: sock=%p, sendpage=%p.\n", __func__, + SOCKET_I(in_file->f_dentry->d_inode), out_file->f_op->sendpage); goto fput_out;+ }+ out_inode = out_file->f_dentry->d_inode; retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); if (retval)@@ -688,7 +697,7 @@ static ssize_t do_sendfile(int out_fd, i retval = -EINVAL; if (unlikely(pos < 0)) goto fput_out;- if (unlikely(pos + count > max)) {+ if (unlikely((unsigned long long)(pos + count) > (unsigned long long)max)) { retval = -EOVERFLOW; if (pos >= max) goto fput_out;diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h--- a/include/linux/skbuff.h+++ b/include/linux/skbuff.h@@ -34,6 +34,8 @@ #define HAVE_ALIGNABLE_SKB /* Ditto 8) */ #define SLAB_SKB /* Slabified skbuffs */ +#define ZEROCOPY_HEADER_CACHE_SIZE 256 /* Maximum receiving zero-copy header size */+ #define CHECKSUM_NONE 0 #define CHECKSUM_HW 1 #define CHECKSUM_UNNECESSARY 2@@ -261,7 +263,8 @@ struct sk_buff { nohdr:1, nfctinfo:3; __u8 pkt_type:3,- fclone:2;+ fclone:2,+ zerocopy:1; __be16 protocol; void (*destructor)(struct sk_buff *skb);@@ -1045,6 +1048,36 @@ static inline struct sk_buff *dev_alloc_ return __dev_alloc_skb(length, GFP_ATOMIC); } +struct zc_buf;++struct zc_handler+{+ struct list_head zc_entry;+ int (* alloc_data)(struct zc_buf *zb);+ int (* commit_data)(struct zc_buf *zb);+};++struct zc_buf+{+ struct zc_handler *zh;+ void *header;+ unsigned int header_size;+ unsigned int size;+ void *priv;+ int status;+ struct sk_buff *skb;+ int (* move_data)(struct zc_buf *zb, unsigned int sz);+ void *priv_data;+};+++extern struct sk_buff *__alloc_skb_zerocopy(struct zc_buf *zb, gfp_t gfp_mask);++static inline struct sk_buff *alloc_skb_zerocopy(struct zc_buf *zb, gfp_t gfp_mask)+{+ return __alloc_skb_zerocopy(zb, gfp_mask);+}+ /** * skb_cow - copy header of skb when it is required * @skb: buffer to cowdiff --git a/include/net/sock.h b/include/net/sock.h--- a/include/net/sock.h+++ b/include/net/sock.h@@ -117,6 +117,20 @@ struct sock_common { struct proto *skc_prot; }; +enum zc_flags {+ ZC_PAGE_READY = 0,+};++struct zc_page+{+ struct page *page;+ struct page *cached_page;+ unsigned int page_offset;+ unsigned int size;+ unsigned int used;+ long flags;+};+ /** * struct sock - network layer representation of sockets * @__sk_common: shared layout with inet_timewait_sock@@ -251,6 +265,13 @@ struct sock { int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb); void (*sk_destruct)(struct sock *sk);++ int (* zc_alloc_data)(struct zc_buf *zb);+ int (* zc_commit_data)(struct zc_buf *zb);+ wait_queue_head_t zc_data_ready;+ spinlock_t zc_lock;+ struct zc_page *zc_pages;+ unsigned int zc_page_num, zc_page_index; }; /*diff --git a/mm/filemap.c b/mm/filemap.c--- a/mm/filemap.c+++ b/mm/filemap.c@@ -1663,7 +1663,7 @@ EXPORT_SYMBOL(read_cache_page); * caller's lru-buffering pagevec. This function is specifically for * generic_file_write(). */-static inline struct page *+struct page * __grab_cache_page(struct address_space *mapping, unsigned long index, struct page **cached_page, struct pagevec *lru_pvec) {@@ -1692,6 +1692,8 @@ repeat: return page; } +EXPORT_SYMBOL_GPL(__grab_cache_page);+ /* * The logic we want is *diff --git a/net/core/Makefile b/net/core/Makefile--- a/net/core/Makefile+++ b/net/core/Makefile@@ -3,7 +3,7 @@ # obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \- gen_stats.o gen_estimator.o+ gen_stats.o gen_estimator.o zerocopy.o obj-$(CONFIG_SYSCTL) += sysctl_net_core.o diff --git a/net/core/datagram.c b/net/core/datagram.c--- a/net/core/datagram.c+++ b/net/core/datagram.c@@ -213,6 +213,10 @@ int skb_copy_datagram_iovec(const struct { int i, err, fraglen, end = 0; struct sk_buff *next = skb_shinfo(skb)->frag_list;++ if (skb->zerocopy)+ return 0;+ next_skb: fraglen = skb_headlen(skb); i = -1;diff --git a/net/core/skbuff.c b/net/core/skbuff.c--- a/net/core/skbuff.c+++ b/net/core/skbuff.c@@ -70,6 +70,7 @@ static kmem_cache_t *skbuff_head_cache __read_mostly; static kmem_cache_t *skbuff_fclone_cache __read_mostly;+static kmem_cache_t *skbuff_head_cache_zerocopy __read_mostly; /* * Keep out-of-line to prevent kernel bloat.@@ -182,6 +183,66 @@ nodata: goto out; } +int zc_alloc_data(struct zc_buf *zb);+int zc_commit_data(struct zc_buf *zb);++struct sk_buff *__alloc_skb_zerocopy(struct zc_buf *zb, gfp_t gfp_mask)+{+ struct sk_buff *skb = NULL;+ void *data;+ int err;+ unsigned int size = SKB_DATA_ALIGN(zb->header_size);++ if (size > ZEROCOPY_HEADER_CACHE_SIZE)+ goto out;+ + zb->status = -1;++ skb = kmem_cache_alloc(skbuff_head_cache, gfp_mask & ~__GFP_DMA);+ if (!skb)+ goto out;+ + data = kmem_cache_alloc(skbuff_head_cache_zerocopy, gfp_mask & ~__GFP_DMA);+ if (!data)+ goto err_out_free_skb;++ memset(skb, 0, offsetof(struct sk_buff, truesize));+ skb->truesize = size + sizeof(struct sk_buff);+ atomic_set(&skb->users, 1);+ skb->head = data;+ skb->data = data;+ skb->tail = data;+ skb->end = data + size;+ + atomic_set(&(skb_shinfo(skb)->dataref), 1);+ skb_shinfo(skb)->nr_frags = 0;+ skb_shinfo(skb)->tso_size = 0;+ skb_shinfo(skb)->tso_segs = 0;+ skb_shinfo(skb)->frag_list = NULL;++ skb->zerocopy = 1;+ /* It could be zerocopied too, but let's use it as is for now. --zbr 2005_10_27 */+ memcpy(skb->data, zb->header, zb->header_size);+ skb_put(skb, zb->header_size);++ zb->skb = skb;++ err = zc_alloc_data(zb);+ if (err)+ goto err_out_free_skb_data;++ printk("%s: skb=%p, skb->len=%u, skb->data_len=%u.\n", __func__, skb, skb->len, skb->data_len);+ +out:+ return skb;+err_out_free_skb_data:+ kmem_cache_free(skbuff_head_cache_zerocopy, data);+err_out_free_skb:+ kmem_cache_free(skbuff_head_cache, skb);+ skb = NULL;+ goto out;+}+ /** * alloc_skb_from_cache - allocate a network buffer * @cp: kmem_cache from which to allocate the data area@@ -284,7 +345,10 @@ void kfree_skbmem(struct sk_buff *skb) struct sk_buff *other; atomic_t *fclone_ref; - skb_release_data(skb);+ if (skb->zerocopy)+ kmem_cache_free(skbuff_head_cache_zerocopy, skb->head);+ else+ skb_release_data(skb); switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: kmem_cache_free(skbuff_head_cache, skb);@@ -1706,6 +1770,14 @@ void __init skb_init(void) NULL, NULL); if (!skbuff_fclone_cache) panic("cannot create skbuff cache");+ + skbuff_head_cache_zerocopy = kmem_cache_create("skbuff_head_cache_zerocopy",+ ZEROCOPY_HEADER_CACHE_SIZE + sizeof(struct skb_shared_info),+ 0,+ SLAB_HWCACHE_ALIGN,+ NULL, NULL);+ if (!skbuff_head_cache_zerocopy)+ panic("cannot create zerocopy skbuff cache"); } EXPORT_SYMBOL(___pskb_trim);@@ -1739,3 +1811,4 @@ EXPORT_SYMBOL(skb_prepare_seq_read); EXPORT_SYMBOL(skb_seq_read); EXPORT_SYMBOL(skb_abort_seq_read); EXPORT_SYMBOL(skb_find_text);+EXPORT_SYMBOL(__alloc_skb_zerocopy);diff --git a/net/core/sock.c b/net/core/sock.c--- a/net/core/sock.c+++ b/net/core/sock.c@@ -704,6 +704,18 @@ void sk_free(struct sock *sk) module_put(owner); } +static void zc_sk_init(struct sock *sk)+{+ spin_lock_init(&sk->zc_lock);+ init_waitqueue_head(&sk->zc_data_ready);+ sk->zc_pages = NULL;+ sk->zc_page_num = 0;+ sk->zc_page_index = 0;+ sk->zc_alloc_data = NULL;+ sk->zc_commit_data = NULL;+}++ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) { struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);@@ -737,6 +749,8 @@ struct sock *sk_clone(const struct sock sock_reset_flag(newsk, SOCK_DONE); skb_queue_head_init(&newsk->sk_error_queue); + zc_sk_init(newsk);+ filter = newsk->sk_filter; if (filter != NULL) sk_filter_charge(newsk, filter);@@ -1320,6 +1334,8 @@ void sock_init_data(struct socket *sock, sk->sk_stamp.tv_usec = -1L; atomic_set(&sk->sk_refcnt, 1);++ zc_sk_init(sk); } void fastcall lock_sock(struct sock *sk)diff --git a/net/core/zerocopy.c b/net/core/zerocopy.cnew file mode 100644--- /dev/null+++ b/net/core/zerocopy.c@@ -0,0 +1,165 @@+/*+ * zerocopy.c+ * + * 2005 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>+ * All rights reserved.+ * + * This program is free software; you can redistribute it and/or modify+ * it under the terms of the GNU General Public License as published by+ * the Free Software Foundation; either version 2 of the License, or+ * (at your option) any later version.+ *+ * This program is distributed in the hope that it will be useful,+ * but WITHOUT ANY WARRANTY; without even the implied warranty of+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the+ * GNU General Public License for more details.+ *
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -