zero-copy.9
+	down(&zc_handler_lock);
+	list_for_each_entry(zh, &zc_handler_list, zc_entry) {
+		if (!zh->setup(zh, sock, p)) {
+			found = 1;
+			break;
+		}
+	}
+	up(&zc_handler_lock);
+
+err_out_exit:
+	return (found)?0:-ENODEV;
+}
+
+int zc_add_handler(struct zc_handler *h)
+{
+	if (!h->alloc_data || !h->commit_data || !h->sock_bucket || !h->sock_bucket_number ||
+			!h->setup || !h->cleanup)
+		return -EINVAL;
+
+	synchronize_rcu();
+
+	down(&zc_handler_lock);
+	list_add_rcu(&h->zc_entry, &zc_handler_list);
+	up(&zc_handler_lock);
+
+	return 0;
+}
+
+void zc_del_handler(struct zc_handler *h)
+{
+	synchronize_rcu();
+
+	down(&zc_handler_lock);
+	list_del_rcu(&h->zc_entry);
+	up(&zc_handler_lock);
+}
+
+extern struct page * __grab_cache_page(struct address_space *mapping, unsigned long index,
+		struct page **cached_page, struct pagevec *lru_pvec);
+
+int commit_page(struct zc_page *zp, struct file *file, struct address_space *mapping)
+{
+	int err;
+	struct address_space_operations *a_ops = mapping->a_ops;
+
+	flush_dcache_page(zp->page);
+	err = a_ops->commit_write(file, zp->page, zp->page_offset, zp->page_offset+zp->used);
+	unlock_page(zp->page);
+	mark_page_accessed(zp->page);
+	page_cache_release(zp->page);
+
+	if (err < 0)
+		goto err_out_exit;
+
+	balance_dirty_pages_ratelimited(mapping);
+
+err_out_exit:
+	return err;
+}
+
+int prepare_page(struct zc_page *zp, struct zsock *zsk, struct file *file, struct address_space *mapping,
+		loff_t *ppos, loff_t count, struct pagevec *lru_pvec)
+{
+	unsigned long index;
+	unsigned long page_offset;
+	unsigned long bytes;
+	struct address_space_operations *a_ops = mapping->a_ops;
+	loff_t pos_allocated = *ppos;
+	int err = 0;
+
+	page_offset = (pos_allocated & (PAGE_CACHE_SIZE -1));
+	index = pos_allocated >> PAGE_CACHE_SHIFT;
+	bytes = PAGE_CACHE_SIZE - page_offset;
+	if (bytes > count)
+		bytes = count;
+
+	zp->page = __grab_cache_page(mapping, index, &zsk->zc_cached_page, lru_pvec);
+	if (!zp->page) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	err = a_ops->prepare_write(file, zp->page, page_offset, page_offset+bytes);
+	if (unlikely(err)) {
+		unlock_page(zp->page);
+		page_cache_release(zp->page);
+		goto err_out_exit;
+	}
+
+	zp->page_offset = page_offset;
+	zp->size = bytes;
+	zp->used = 0;
+	zp->seq = zsk->zc_seq_first + pos_allocated;
+	clear_bit(ZC_PAGE_READY, &zp->flags);
+
+	pos_allocated += bytes;
+
+	*ppos = pos_allocated;
+
+err_out_exit:
+	return err;
+}
+
+
+void sk_zc_fini(struct zsock *zsk)
+{
+	if (zsk) {
+		unsigned int zc_page_num;
+		struct zc_page *zc_pages;
+		unsigned long flags;
+		struct sock *sk = NULL;
+
+		write_lock_irqsave(&zsk->zc_lock, flags);
+		zc_page_num = zsk->zc_page_num;
+		zc_pages = zsk->zc_pages;
+
+		zsk->zc_pages = NULL;
+		zsk->zc_page_num = 0;
+		zsk->zc_page_index = 0;
+		zsk->zc_alloc_data = NULL;
+		zsk->zc_commit_data = NULL;
+		if (zsk->sk) {
+			sk = zsk->sk;
+			zsk->sk->zsk = NULL;
+			zsk->sk = NULL;
+		}
+		write_unlock_irqrestore(&zsk->zc_lock, flags);
+
+		synchronize_rcu();
+
+		if (zc_page_num) {
+			struct address_space *mapping = zsk->zc_file->f_mapping;
+			int i;
+
+			if (sk)
+				skb_queue_purge(&sk->sk_receive_queue);
+
+			zsk->handler->cleanup(zsk);
+			zc_handler_put(zsk->handler);
+
+			/*
+			 * No new skbs can contribute data into VFS cache after this
+			 * condition, so we only must care about those which are
+			 * in socket queue already or will be inserted there after
+			 * allocation, but allocation itself will always fail
+			 * due to above locked changes.
+			 */
+
+			if (zsk->zc_cached_page) {
+				page_cache_release(zsk->zc_cached_page);
+				zsk->zc_cached_page = NULL;
+			}
+
+			for (i=0; i<zc_page_num; ++i)
+				commit_page(&zc_pages[i], zsk->zc_file, mapping);
+
+			zsk->zc_file->f_mode &= ~FMODE_ZEROCOPY;
+			fput(zsk->zc_file);
+			zsk->zc_file = NULL;
+
+			kfree(zc_pages);
+		}
+	}
+}
+
+static void sk_zc_init(struct zsock *zsk)
+{
+	rwlock_init(&zsk->zc_lock);
+	init_waitqueue_head(&zsk->zc_data_ready);
+	zsk->zc_pages = NULL;
+	zsk->zc_page_num = 0;
+	zsk->zc_page_index = 0;
+	zsk->zc_alloc_data = NULL;
+	zsk->zc_commit_data = NULL;
+	zsk->zc_file = NULL;
+	zsk->zc_cached_page = NULL;
+}
+
+struct zsock *zsk_alloc(struct zc_handler *handler, void *priv, unsigned int priv_size, int (* insert)(struct zsock *zsk), gfp_t gfp_mask)
+{
+	struct zsock *zsk;
+
+	zsk = kzalloc(sizeof(struct zsock) + priv_size, gfp_mask);
+	if (!zsk)
+		return NULL;
+
+	/* 1 for generic socket usage, i.e. it could be removed from sock_close(). */
+	atomic_set(&zsk->refcnt, 1);
+	zsk->handler = handler;
+	zsk->priv_size = priv_size;
+	if (priv_size) {
+		zsk->priv = zsk+1;
+		memcpy(zsk->priv, priv, priv_size);
+	} else
+		zsk->priv = NULL;
+
+	zc_handler_get(handler);
+
+	sk_zc_init(zsk);
+
+	if (insert) {
+		int err;
+
+		err = insert(zsk);
+		if (err) {
+			zc_handler_put(handler);
+			zsk_free(zsk);
+			return NULL;
+		}
+	}
+
+	return zsk;
+}
+
+void zsk_free(struct zsock *zsk)
+{
+	kfree(zsk);
+}
+
+static inline u32 tcp_udp_v4_hash(unsigned int bucket_number, const u32 src, const u16 sport, const u32 dst, const u16 dport)
+{
+	return inet_ehashfn(src, sport, dst, dport) & (bucket_number - 1);
+}
+
+int tcp_udp_v4_zc_sock_insert(struct zsock *zsk)
+{
+	u32 hash;
+	unsigned long flags;
+	struct tcp_udp_v4_priv *priv = zsk_priv(zsk);
+	struct zc_sock_bucket *b;
+
+	if (!priv)
+		return -ENODEV;
+
+	hash = tcp_udp_v4_hash(zsk->handler->sock_bucket_number, priv->src, priv->sport, priv->dst, priv->dport);
+
+	b = &zsk->handler->sock_bucket[hash];
+
+	write_lock_irqsave(&b->lock, flags);
+	list_add_rcu(&zsk->zc_entry, &b->list);
+	write_unlock_irqrestore(&b->lock, flags);
+
+	return 0;
+}
+
+int tcp_udp_v4_zc_sock_remove(struct zsock *zsk)
+{
+	u32 hash;
+	unsigned long flags;
+	struct tcp_udp_v4_priv *priv = zsk_priv(zsk);
+	struct zc_sock_bucket *b;
+
+	if (!priv)
+		return -ENODEV;
+
+	hash = tcp_udp_v4_hash(zsk->handler->sock_bucket_number, priv->src, priv->sport, priv->dst, priv->dport);
+
+	b = &zsk->handler->sock_bucket[hash];
+
+	write_lock_irqsave(&b->lock, flags);
+	list_del_rcu(&zsk->zc_entry);
+	write_unlock_irqrestore(&b->lock, flags);
+
+	return 0;
+}
+
+/*
+ * Must be called under RCU cover and with interrupts disabled.
+ */
+static struct zsock *tcp_udp_v4_zc_sock_lookup(const struct zc_sock_bucket *bucket, const unsigned int bucket_number,
+		const u32 src, const u16 sport, const u32 dst, const u16 dport)
+{
+	u32 hash = tcp_udp_v4_hash(bucket_number, src, sport, dst, dport);
+	struct zsock *zsk;
+	struct tcp_udp_v4_priv *priv;
+
+	list_for_each_entry_rcu(zsk, &bucket[hash].list, zc_entry) {
+		priv = zsk_priv(zsk);
+
+		if (priv->sport == sport && priv->dport == dport && priv->src == src && priv->dst == dst) {
+			zsk_get(zsk);
+			return zsk;
+		}
+	}
+
+	return NULL;
+}
+
+static int tcp_udp_v4_sendfile_alloc_data(struct zc_handler *zh, struct zc_buf *zb)
+{
+	struct ethhdr *eth;
+	struct iphdr *iph;
+	struct zsock *zsk;
+	int err = -EINVAL;
+	u16 sport, dport;
+	unsigned long flags;
+	u32 seq, ack;
+
+	if (zb->header_size < sizeof(struct ethhdr) + sizeof(struct iphdr))
+		goto err_out_exit;
+
+	eth = zb->header;
+
+	if (eth->h_proto != htons(ETH_P_IP))
+		goto err_out_exit;
+
+	iph = (struct iphdr *)(eth + 1);
+
+	if (iph->protocol != IPPROTO_TCP && iph->protocol != IPPROTO_UDP)
+		goto err_out_exit;
+
+	if (iph->protocol == IPPROTO_TCP) {
+		struct tcphdr *tcph = (struct tcphdr *)(((u8 *)iph) + iph->ihl*4);
+		if (zb->header_size < sizeof(struct ethhdr) + sizeof(struct iphdr) + sizeof(struct tcphdr))
+			goto err_out_exit;
+		sport = tcph->source;
+		dport = tcph->dest;
+		seq = ntohl(tcph->seq);
+		ack = ntohl(tcph->ack_seq);
+	} else {
+		struct udphdr *udph = (struct udphdr *)(((u8 *)iph) + iph->ihl*4);
+		if (zb->header_size < sizeof(struct ethhdr) + sizeof(struct iphdr) + sizeof(struct udphdr))
+			goto err_out_exit;
+		sport = udph->source;
+		dport = udph->dest;
+		seq = ack = 0;
+	}
+
+	local_irq_save(flags);
+	rcu_read_lock();
+	zsk = tcp_udp_v4_zc_sock_lookup(zh->sock_bucket, zh->sock_bucket_number, iph->daddr, dport, iph->saddr, sport);
+	if (zsk) {
+#if 1
+		printk("%s: %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u, seq=%u, ack=%u.\n",
+				__func__, NIPQUAD(iph->saddr), htons(sport), NIPQUAD(iph->daddr), htons(dport), seq, ack);
+#endif
+
+		read_lock(&zsk->zc_lock);
+		if (zsk->zc_alloc_data && zsk->zc_pages) {
+			zb->priv = zsk;
+			err = zsk->zc_alloc_data(zb);
+			zb->status = (err)?1:0;
+			wake_up(&zsk->zc_data_ready);
+		}
+		read_unlock(&zsk->zc_lock);
+		zsk_put(zsk);
+	}
+	rcu_read_unlock();
+	local_irq_restore(flags);
+
+err_out_exit:
+	return err;
+}
+
+static int tcp_udp_v4_sendfile_commit_data(struct zc_handler *zh, struct zc_buf *zb)
+{
+	struct zsock *zsk = zb->priv;
+	int err;
+	unsigned long flags;
+
+	read_lock_irqsave(&zsk->zc_lock, flags);
+	err = zsk->zc_commit_data(zb);
+	read_unlock_irqrestore(&zsk->zc_lock, flags);
+
+	wake_up(&zsk->zc_data_ready);
+
+	return err;
+}
+
+static int tcp_udp_v4_sendfile_check(struct zc_handler *zh, struct socket *sock, struct sock_zc_setup_data *p)
+{
+	struct tcp_udp_v4_priv *priv;
+	u32 type = ntohl(p->type);
+	u32 size = ntohl(p->size);
+
+	if (type != IPPROTO_TCP && type != IPPROTO_UDP)
+		return -EINVAL;
+
+	if (size != sizeof(struct tcp_udp_v4_priv))
+		return -EINVAL;
+
+	priv = (struct tcp_udp_v4_priv *)p->data;
+
+	return 0;
+}
+
+static int tcp_udp_v4_sendfile_setup(struct zc_handler *zh, struct socket *sock, struct sock_zc_setup_data *p)
+{
+	struct tcp_udp_v4_priv *priv = (struct tcp_udp_v4_priv *)p->data;
+	int err;
+
+	err = tcp_udp_v4_sendfile_check(zh, sock, p);
+	if (err)
+		return err;
+
+	return tcp_udp_v4_sock_zc_init(sock, priv);
+}
+
+static int tcp_udp_v4_sendfile_cleanup(struct zsock *zsk)
+{
+	tcp_udp_v4_zc_sock_remove(zsk);
+	return 0;
+}
+
+static int zc_add_tcp(void)
+{
+	int i;
+
+	for (i=0; i<tcp_udp_v4_zc_handler.sock_bucket_number; ++i) {
+		INIT_LIST_HEAD(&tcp_udp_v4_zc_handler.sock_bucket[i].list);
+		rwlock_init(&tcp_udp_v4_zc_handler.sock_bucket[i].lock);
+	}
+
+	atomic_set(&tcp_udp_v4_zc_handler.refcnt, 1);
+
+	return zc_add_handler(&tcp_udp_v4_zc_handler);
+}
+
+late_initcall(zc_add_tcp);
diff --git a/net/socket.c b/net/socket.c
--- a/net/socket.c
+++ b/net/socket.c
@@ -44,6 +44,7 @@
 *	Tigran Aivazian	:	sys_send(args) calls sys_sendto(args, NULL, 0)
 *	Tigran Aivazian	:	Made listen(2) backlog sanity checks
 *				protocol-independent
+ *	Evgeniy Polyakov:	Receiving zero-copy.
 *
 *
 *	This program is free software; you can redistribute it and/or
@@ -63,6 +64,7 @@
 #include <linux/smp_lock.h>
 #include <linux/socket.h>
 #include <linux/file.h>
+#include <linux/fs.h>
 #include <linux/net.h>
 #include <linux/interrupt.h>
 #include <linux/netdevice.h>
@@ -84,6 +86,11 @@
 #include <linux/compat.h>
 #include <linux/kmod.h>
 #include <linux/audit.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>

 #ifdef CONFIG_NET_RADIO
 #include <linux/wireless.h>		/* Note : will define WIRELESS_EXT */
@@ -116,6 +123,7 @@ static ssize_t sock_writev(struct file *
 			   unsigned long count, loff_t *ppos);
 static ssize_t sock_sendpage(struct file *file, struct page *page,
 			     int offset, size_t size, loff_t *ppos, int more);
+static ssize_t sock_sendfile(struct file *file, loff_t *ppos, size_t count, read_actor_t actor, void *target);


 /*
@@ -136,7 +144,8 @@ static struct file_operations socket_fil
 	.fasync =	sock_fasync,
 	.readv =	sock_readv,
 	.writev =	sock_writev,
-	.sendpage =	sock_sendpage
+	.sendpage =	sock_sendpage,
+	.sendfile =	sock_sendfile,
 };

 /*
@@ -726,6 +735,467 @@ static ssize_t sock_aio_write(struct kio
 	return __sock_sendmsg(iocb, sock, &x->async_msg, size);