📄 zero-copy.3
字号:
+ void *data = NULL;++ if (unlikely(size > PAGE_SIZE))+ return NULL;+ + rcu_read_lock();+ list_for_each_entry_rcu(zh, &zc_list, zc_entry) {+ data = zh->alloc_data(zh, header, header_size, size, priv, status);+ if (data) {+ *__zh = zh;+ break;+ }+ }+ rcu_read_unlock();++ return data;+}++/* Forwards a commit to zh->commit_data with the same arguments; does nothing when zh is NULL. */ void zc_commit_data(void *header, unsigned int header_size, unsigned int size, void *priv, struct zc_handler *zh)+{+ if (zh)+ zh->commit_data(zh, header, header_size, size, priv);+}++/* Adds h to zc_list under zc_lock using list_add_rcu. Returns -EINVAL when either alloc_data or commit_data is unset, otherwise 0. */ int zc_add_handler(struct zc_handler *h)+{+ if (!h->alloc_data || !h->commit_data)+ return -EINVAL;++ spin_lock(&zc_lock);+ list_add_rcu(&h->zc_entry, &zc_list);+ spin_unlock(&zc_lock);++ return 0;+}++/* Removes h from zc_list under zc_lock using list_del_rcu, then calls synchronize_rcu before returning. */ void zc_del_handler(struct zc_handler *h)+{+ spin_lock(&zc_lock);+ list_del_rcu(&h->zc_entry);+ spin_unlock(&zc_lock);++ synchronize_rcu();+}++extern struct inet_hashinfo __cacheline_aligned tcp_hashinfo;++/* Handler alloc callback: treats header as Ethernet, IPv4 and TCP headers laid out back to back, looks up a socket for the address and port pair and asks its zc_alloc_data hook for size bytes. The lookup runs at most twice: once with the tuple as found in the packet and once with source and destination swapped. When a socket with a hook is found, *priv is set to it and *status becomes 0 if a buffer was returned or 1 if not. Returns the buffer or NULL; also returns NULL when header_size is too small for the three headers or the IP protocol is not TCP. zh is not used. */ static void *tcp_sendfile_alloc_data(struct zc_handler *zh, void *header, unsigned int header_size, unsigned int size, void **priv, int *status)+{+ struct ethhdr *eth;+ struct iphdr *iph;+ struct tcphdr *tcph;+ struct sock *sk;+ void *data = NULL;+ int dif, need_exit;+ u32 saddr, daddr;+ u16 sport, dport;+ + if (header_size < sizeof(struct ethhdr) + sizeof(struct iphdr) + sizeof(struct tcphdr)) {+ printk("%s: wrong size %u, must be %zu.\n", + __func__, header_size, sizeof(struct ethhdr) + sizeof(struct iphdr) + sizeof(struct tcphdr));+ return NULL;+ }++ eth = header;+ iph = (struct iphdr *)(eth + 1);++ if (iph->protocol != IPPROTO_TCP)+ return NULL;+ + tcph = (struct tcphdr *)(iph + 1); /* NOTE(review): steps over a fixed sizeof(struct iphdr); an IP header carrying options would place the TCP header further on - confirm callers guarantee no options. */++ dif = 0;+ + saddr = iph->saddr;+ sport = tcph->source;+ daddr = iph->daddr;+ dport = tcph->dest;+ need_exit = 0;+ + while (1) {+ /*+ * I suspect it is not enough to disable BHs,+ * since it can be [and is] called from hard IRQ context.+ * Must do something with bound devices + */+ sk = inet_lookup(&tcp_hashinfo, saddr, sport, daddr, dport, dif); /* NOTE(review): no release of sk is visible in this function; confirm whether inet_lookup hands back a reference that must be dropped. */++ if (sk && sk->zc_alloc_data) {+ *priv = sk;+ data = 
sk->zc_alloc_data(size, sk);+ *status = (data)?0:1;+ need_exit = 1;+ }+ printk("%s: sk=%p, %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u, data=%p, status=%d.\n", + __func__, sk, + NIPQUAD(saddr), ntohs(sport),+ NIPQUAD(daddr), ntohs(dport),+ data, *status); /* NOTE(review): *status is printed even on passes where this function has not stored to it, so the value shown then comes from the caller. */++ if (need_exit)+ break;+ + if (!sk || !sk->zc_alloc_data) {+ daddr = iph->saddr;+ dport = tcph->source;+ saddr = iph->daddr;+ sport = tcph->dest;+ need_exit = 1;+ }+ };+ + return data;+}++/* Handler commit callback: priv is used as the struct sock and its zc_commit_data hook is called with size; the hook result is discarded. NOTE(review): neither priv nor the hook pointer is checked for NULL here. */ static void tcp_sendfile_commit_data(struct zc_handler *zh, void *header, unsigned int header_size, unsigned int size, void *priv)+{+ struct sock *sk = priv;++ sk->zc_commit_data(size, sk);+}++/* Registers zc_tcp_sendfile_handler through zc_add_handler and returns its result; hooked up below as a late_initcall. */ int __init zc_add_tcp(void)+{+ return zc_add_handler(&zc_tcp_sendfile_handler);+}++late_initcall(zc_add_tcp);diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.cdiff --git a/net/socket.c b/net/socket.c--- a/net/socket.c+++ b/net/socket.c@@ -44,6 +44,7 @@ * Tigran Aivazian : sys_send(args) calls sys_sendto(args, NULL, 0) * Tigran Aivazian : Made listen(2) backlog sanity checks * protocol-independent+ * Evgeniy Polyakov: Added sock_sendfile(). 
* * * This program is free software; you can redistribute it and/or@@ -84,6 +85,10 @@ #include <linux/compat.h> #include <linux/kmod.h> #include <linux/audit.h>+#include <linux/pagevec.h>+#include <linux/pagemap.h>+#include <linux/swap.h>+#include <linux/writeback.h> #ifdef CONFIG_NET_RADIO #include <linux/wireless.h> /* Note : will define WIRELESS_EXT */@@ -116,6 +121,7 @@ static ssize_t sock_writev(struct file * unsigned long count, loff_t *ppos); static ssize_t sock_sendpage(struct file *file, struct page *page, int offset, size_t size, loff_t *ppos, int more);+/* sock_sendfile (body further down in this patch) sets up page-cache pages of the struct file passed as target so that the zero-copy hooks can fill them with data for the socket behind the file given as the first argument; actor is not used. It prepares a ring of at most 16 zc_page slots with prepare_page, publishes the ring together with the zc_sock_alloc_data and zc_sock_commit_data hooks on the sock under sk->zc_lock, then sleeps on sk->zc_data_ready. After a wakeup where its local index differs from sk->zc_page_index it runs commit_page on every slot flagged ZC_PAGE_READY and prepares that slot again. The loop ends when count reaches zero, a signal is pending or commit_page reports an error. On the way out commit_page is run on the slots once more and the array is freed. Returns the number of bytes written (also added to *ppos) or an error code from setup or commit_page. Helpers: prepare_page grabs the page-cache page covering *ppos, calls prepare_write for the part of the page limited by count, fills in the slot and advances *ppos; commit_page calls commit_write for the used part, unlocks and releases the page and returns the commit_write result. zc_sock_alloc_data returns NULL when the sock is NULL or has no slots; otherwise, under sk->zc_lock, it returns page_address(page) plus page_offset of the current slot, or of the next slot when the current one is full or flagged ready, and NULL when size does not fit. zc_sock_commit_data returns 1 when size differs from the size of the current slot; otherwise, when the slot is fully used, it flags the slot ready and advances the index with wraparound, and in either case calls wake_up on sk->zc_data_ready and returns 0. NOTE(review): the slot count is count >> PAGE_CACHE_SHIFT, which is 0 for requests shorter than one page. NOTE(review): the address returned by zc_sock_alloc_data does not include zp->used, and moving to the next slot there does not update sk->zc_page_index. NOTE(review): the result of the prepare_page call inside the wait loop is not checked, the local zc_page_index counter is never wrapped, and sk->zc_pages still points at the array when it is kfreed. Confirm each point against the intended design before relying on this path. */ ssize_t sock_sendfile(struct file *file, loff_t *ppos, size_t count, read_actor_t actor, void *target); /*@@ -136,7 +142,8 @@ static struct file_operations socket_fil .fasync = sock_fasync, .readv = sock_readv, .writev = sock_writev,- .sendpage = sock_sendpage+ .sendpage = sock_sendpage,+ .sendfile = sock_sendfile, }; /*@@ -726,6 +733,257 @@ static ssize_t sock_aio_write(struct kio return __sock_sendmsg(iocb, sock, &x->async_msg, size); } +void *zc_sock_alloc_data(unsigned int size, void *priv)+{+ struct sock *sk = priv;+ void *data = NULL;+ unsigned long flags;+ struct zc_page *zp;++ if (!sk || !sk->zc_page_num)+ goto out;++ spin_lock_irqsave(&sk->zc_lock, flags);+ zp = &sk->zc_pages[sk->zc_page_index];+ if (zp->size == zp->used || test_bit(ZC_PAGE_READY, &zp->flags)) {+ unsigned int index = sk->zc_page_index + 1;++ BUG_ON(index > sk->zc_page_num);++ if (index == sk->zc_page_num)+ index = 0;+ zp = &sk->zc_pages[index];+ if (zp->size == zp->used || test_bit(ZC_PAGE_READY, &zp->flags))+ goto out_unlock;+ }+ if (zp->size - zp->used < size)+ goto out_unlock;++ data = page_address(zp->page) + zp->page_offset;+ zp->used += size;++out_unlock:+ spin_unlock_irqrestore(&sk->zc_lock, flags);+out:+ return data;+}++int zc_sock_commit_data(unsigned int size, void *priv)+{+ struct sock *sk = priv;+ unsigned long flags;+ struct zc_page *zp;++ spin_lock_irqsave(&sk->zc_lock, flags);+ + BUG_ON(sk->zc_page_index + 1 > 
sk->zc_page_num);+ + zp = &sk->zc_pages[sk->zc_page_index];++ if (unlikely(size != zp->size)) {+ spin_unlock_irqrestore(&sk->zc_lock, flags);+ return 1;+ }++ + if (zp->used == zp->size) {+ set_bit(ZC_PAGE_READY, &zp->flags);+ if (++sk->zc_page_index == sk->zc_page_num)+ sk->zc_page_index = 0;+ }+ spin_unlock_irqrestore(&sk->zc_lock, flags);++ wake_up(&sk->zc_data_ready);++ return 0;+}++extern struct page * __grab_cache_page(struct address_space *mapping, unsigned long index,+ struct page **cached_page, struct pagevec *lru_pvec);++static int commit_page(struct zc_page *zp, struct file *file, struct address_space *mapping)+{+ int err;+ struct address_space_operations *a_ops = mapping->a_ops;++ flush_dcache_page(zp->page);+ err = a_ops->commit_write(file, zp->page, zp->page_offset, zp->page_offset+zp->used);+ unlock_page(zp->page);+ mark_page_accessed(zp->page);+ page_cache_release(zp->page);+ if (zp->cached_page)+ page_cache_release(zp->cached_page);+ + if (err < 0)+ goto err_out_exit;++ balance_dirty_pages_ratelimited(mapping);++err_out_exit:+ return err;+}++static int prepare_page(struct zc_page *zp, struct file *file, struct address_space *mapping, + loff_t *ppos, loff_t count, struct pagevec *lru_pvec)+{+ unsigned long index;+ unsigned long page_offset;+ unsigned long bytes;+ struct address_space_operations *a_ops = mapping->a_ops;+ loff_t pos_allocated = *ppos;+ int err = 0;+ + page_offset = (pos_allocated & (PAGE_CACHE_SIZE -1));+ index = pos_allocated >> PAGE_CACHE_SHIFT;+ bytes = PAGE_CACHE_SIZE - page_offset;+ if (bytes > count)+ bytes = count;+ + zp->page = __grab_cache_page(mapping, index, &zp->cached_page, lru_pvec);+ if (!zp->page) {+ err = -ENOMEM;+ goto err_out_exit;+ }++ err = a_ops->prepare_write(file, zp->page, page_offset, page_offset+bytes);+ if (unlikely(err)) {+ unlock_page(zp->page);+ page_cache_release(zp->page);+ goto err_out_exit;+ }++ zp->page_offset = page_offset;+ zp->size = bytes;+ zp->used = 0;+ clear_bit(ZC_PAGE_READY, &zp->flags);+ 
pos_allocated += bytes;++ *ppos = pos_allocated;++err_out_exit:+ return err;+}++ssize_t sock_sendfile(struct file *in_file, loff_t *ppos, size_t count, read_actor_t actor, void *target)+{+ struct socket *sock;+ struct sock *sk;+ int err = 0;+ size_t written = 0;+ struct file *file = target;+ struct address_space *mapping = file->f_mapping;+ struct inode *inode = mapping->host;+ loff_t pos, pos_allocated;+ struct pagevec lru_pvec;+ unsigned long flags;+ int pnum_max = 16, i;+ unsigned int zc_page_index;+ struct zc_page *zc_pages, *zp;++ if (!count)+ return 0;++ pos = pos_allocated = *ppos;+ err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));+ if (err)+ goto err_out_exit;++ sock = SOCKET_I(in_file->f_dentry->d_inode);++ if (!sock || !sock->sk) {+ err = -ENODEV;+ goto err_out_exit;+ }+ sk = sock->sk;++ pnum_max = ((count >> PAGE_CACHE_SHIFT) > pnum_max)?pnum_max:(count >> PAGE_CACHE_SHIFT);+ zc_pages = kzalloc(sizeof(struct zc_page) * pnum_max, GFP_KERNEL);+ if (!zc_pages) {+ err = -ENOMEM;+ goto err_out_exit;+ }++ pagevec_init(&lru_pvec, 0);+ + err = 0;+ for (i=0; i<pnum_max; ++i) {+ zp = &zc_pages[i];+ + err = prepare_page(zp, file, mapping, &pos_allocated, count, &lru_pvec);+ if (unlikely(err))+ goto err_out_release_pages;+ }++ zc_page_index = 0;+ + spin_lock_irqsave(&sk->zc_lock, flags);+ sk->zc_pages = zc_pages;+ sk->zc_page_num = pnum_max;+ sk->zc_page_index = zc_page_index;+ sk->zc_alloc_data = &zc_sock_alloc_data;+ sk->zc_commit_data = &zc_sock_commit_data;+ spin_unlock_irqrestore(&sk->zc_lock, flags);++ printk("%s: sk=%p, %d pages have been set up.\n", __func__, sk, pnum_max);++ while (count) {+ struct zc_page *zp;+ + interruptible_sleep_on(&sk->zc_data_ready);+ + printk("%s: wakeup: zc_page_index=%d, sk->zc_page_index=%d.\n", __func__, zc_page_index, sk->zc_page_index);++ spin_lock_irqsave(&sk->zc_lock, flags);+ if (zc_page_index == sk->zc_page_index) {+ spin_unlock_irqrestore(&sk->zc_lock, flags);+ goto last_check;+ }+ 
spin_unlock_irqrestore(&sk->zc_lock, flags);++ for (i=0; i<pnum_max; ++i) {+ zp = &zc_pages[i];+ + if (test_bit(ZC_PAGE_READY, &zp->flags)) {+ printk("%s: checking page %p: page=%p, flags=%08lx, page_offset=%08x, size=%08x, used=%08x\n", + __func__, zp, zp->page, zp->flags, zp->page_offset, zp->size, zp->used);+ err = commit_page(zp, file, mapping);+ if (err)+ goto err_out_release_all_pages;+ + count -= zp->used;+ written += zp->used;+ pos += zp->used;+ + zc_page_index++;++ err = prepare_page(zp, file, mapping, &pos_allocated, count, &lru_pvec);+ }+ }++last_check:+ if (signal_pending(current))+ break;+ }+ + pagevec_lru_add(&lru_pvec);++ *ppos += written;+ err = written;++err_out_release_all_pages:+ i = pnum_max;+err_out_release_pages:+ for (--i; i>=0; --i)+ commit_page(&zc_pages[i], file, mapping);++ kfree(zc_pages);++err_out_exit:+ + return err;+}+ static ssize_t sock_sendpage(struct file *file, struct page *page, int offset, size_t size, loff_t *ppos, int more) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -