📄 sock.c
字号:
/* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Generic socket support routines. Memory allocators, socket lock/release * handler for protocols to use and generic option handler. * * * Version: $Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $ * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Florian La Roche, <flla@stud.uni-sb.de> * Alan Cox, <A.Cox@swansea.ac.uk> * * Fixes: * Alan Cox : Numerous verify_area() problems * Alan Cox : Connecting on a connecting socket * now returns an error for tcp. * Alan Cox : sock->protocol is set correctly. * and is not sometimes left as 0. * Alan Cox : connect handles icmp errors on a * connect properly. Unfortunately there * is a restart syscall nasty there. I * can't match BSD without hacking the C * library. Ideas urgently sought! * Alan Cox : Disallow bind() to addresses that are * not ours - especially broadcast ones!! * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, * instead they leave that for the DESTROY timer. * Alan Cox : Clean up error flag in accept * Alan Cox : TCP ack handling is buggy, the DESTROY timer * was buggy. Put a remove_sock() in the handler * for memory when we hit 0. Also altered the timer * code. The ACK stuff can wait and needs major * TCP layer surgery. * Alan Cox : Fixed TCP ack bug, removed remove sock * and fixed timer/inet_bh race. * Alan Cox : Added zapped flag for TCP * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... * Rick Sladkey : Relaxed UDP rules for matching packets. * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support * Pauline Middelink : identd support * Alan Cox : Fixed connect() taking signals I think. * Alan Cox : SO_LINGER supported * Alan Cox : Error reporting fixes * Anonymous : inet_create tidied up (sk->reuse setting) * Alan Cox : inet sockets don't set sk->type! * Alan Cox : Split socket option code * Alan Cox : Callbacks * Alan Cox : Nagle flag for Charles & Johannes stuff * Alex : Removed restriction on inet fioctl * Alan Cox : Splitting INET from NET core * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code * Alan Cox : Split IP from generic code * Alan Cox : New kfree_skbmem() * Alan Cox : Make SO_DEBUG superuser only. * Alan Cox : Allow anyone to clear SO_DEBUG * (compatibility fix) * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. * Alan Cox : Allocator for a socket is settable. * Alan Cox : SO_ERROR includes soft errors. * Alan Cox : Allow NULL arguments on some SO_ opts * Alan Cox : Generic socket allocation to make hooks * easier (suggested by Craig Metz). * Michael Pall : SO_ERROR returns positive errno again * Steve Whitehouse: Added default destructor to free * protocol private data. * Steve Whitehouse: Added various other default routines * common to several socket families. * Chris Evans : Call suser() check last on F_SETOWN * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() * Andi Kleen : Fix write_space callback * Chris Evans : Security fixes - signedness again * Arnaldo C. Melo : cleanups, use skb_queue_purge * * To Fix: * * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */#include <linux/capability.h>#include <linux/errno.h>#include <linux/types.h>#include <linux/socket.h>#include <linux/in.h>#include <linux/kernel.h>#include <linux/module.h>#include <linux/proc_fs.h>#include <linux/seq_file.h>#include <linux/sched.h>#include <linux/timer.h>#include <linux/string.h>#include <linux/sockios.h>#include <linux/net.h>#include <linux/mm.h>#include <linux/slab.h>#include <linux/interrupt.h>#include <linux/poll.h>#include <linux/tcp.h>#include <linux/init.h>#include <linux/highmem.h>#include <asm/uaccess.h>#include <asm/system.h>#include <linux/netdevice.h>#include <net/protocol.h>#include <linux/skbuff.h>#include <net/net_namespace.h>#include <net/request_sock.h>#include <net/sock.h>#include <net/xfrm.h>#include <linux/ipsec.h>#include <linux/filter.h>#ifdef CONFIG_INET#include <net/tcp.h>#endif/* * Each address family might have different locking rules, so we have * one slock key per address family: */static struct lock_class_key af_family_keys[AF_MAX];static struct lock_class_key af_family_slock_keys[AF_MAX];#ifdef CONFIG_DEBUG_LOCK_ALLOC/* * Make lock validator output more readable. (we pre-construct these * strings build-time, so that runtime initialization of socket * locks is fast): */static const char *af_family_key_strings[AF_MAX+1] = { "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" , "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK", "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" , "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" , "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" , "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" , "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" , "sk_lock-21" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" , "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" , "sk_lock-27" , "sk_lock-28" , "sk_lock-29" , "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" , "sk_lock-AF_RXRPC" , "sk_lock-AF_MAX"};static const char *af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" , "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK", "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" , "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" , "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" , "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" , "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" , "slock-21" , "slock-AF_SNA" , "slock-AF_IRDA" , "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" , "slock-27" , "slock-28" , "slock-29" , "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" , "slock-AF_RXRPC" , "slock-AF_MAX"};static const char *af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK", "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" , "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" , "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" , "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" , "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" , "clock-21" , "clock-AF_SNA" , "clock-AF_IRDA" , "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" , "clock-27" , "clock-28" , "clock-29" , "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" , "clock-AF_RXRPC" , "clock-AF_MAX"};#endif/* * sk_callback_lock locking rules are per-address-family, * so split the lock classes by using a per-AF key: */static struct lock_class_key af_callback_keys[AF_MAX];/* Take into consideration the size of the struct sk_buff overhead in the * determination of these values, since that is non-constant across * platforms. This makes socket queueing behavior and performance * not depend upon such differences. */#define _SK_MEM_PACKETS 256#define _SK_MEM_OVERHEAD (sizeof(struct sk_buff) + 256)#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)/* Run time adjustable parameters. */__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;/* Maximal space eaten by iovec or ancilliary data plus some space */int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen){ struct timeval tv; if (optlen < sizeof(tv)) return -EINVAL; if (copy_from_user(&tv, optval, sizeof(tv))) return -EFAULT; if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) return -EDOM; if (tv.tv_sec < 0) { static int warned __read_mostly; *timeo_p = 0; if (warned < 10 && net_ratelimit()) warned++; printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) " "tries to set negative timeout\n", current->comm, task_pid_nr(current)); return 0; } *timeo_p = MAX_SCHEDULE_TIMEOUT; if (tv.tv_sec == 0 && tv.tv_usec == 0) return 0; if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1)) *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ); return 0;}static void sock_warn_obsolete_bsdism(const char *name){ static int warned; static char warncomm[TASK_COMM_LEN]; if (strcmp(warncomm, current->comm) && warned < 5) { strcpy(warncomm, current->comm); printk(KERN_WARNING "process `%s' is using obsolete " "%s SO_BSDCOMPAT\n", warncomm, name); warned++; }}static void sock_disable_timestamp(struct sock *sk){ if (sock_flag(sk, SOCK_TIMESTAMP)) { sock_reset_flag(sk, SOCK_TIMESTAMP); net_disable_timestamp(); }}int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb){ int err = 0; int skb_len; /* Cast skb->rcvbuf to unsigned... It's pointless, but reduces number of warnings when compiling with -W --ANK */ if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= (unsigned)sk->sk_rcvbuf) { err = -ENOMEM; goto out; } err = sk_filter(sk, skb); if (err) goto out; skb->dev = NULL; skb_set_owner_r(skb, sk); /* Cache the SKB length before we tack it onto the receive * queue. Once it is added it no longer belongs to us and * may be freed by other threads of control pulling packets * from the queue. */ skb_len = skb->len; skb_queue_tail(&sk->sk_receive_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk, skb_len);out: return err;}EXPORT_SYMBOL(sock_queue_rcv_skb);int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested){ int rc = NET_RX_SUCCESS; if (sk_filter(sk, skb)) goto discard_and_relse; skb->dev = NULL; if (nested) bh_lock_sock_nested(sk); else bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { /* * trylock + unlock semantics: */ mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); rc = sk->sk_backlog_rcv(sk, skb); mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); } else sk_add_backlog(sk, skb); bh_unlock_sock(sk);out: sock_put(sk); return rc;discard_and_relse: kfree_skb(skb); goto out;}EXPORT_SYMBOL(sk_receive_skb);struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie){ struct dst_entry *dst = sk->sk_dst_cache; if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { sk->sk_dst_cache = NULL; dst_release(dst); return NULL; } return dst;}EXPORT_SYMBOL(__sk_dst_check);struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie){ struct dst_entry *dst = sk_dst_get(sk); if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { sk_dst_reset(sk); dst_release(dst); return NULL; } return dst;}EXPORT_SYMBOL(sk_dst_check);static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen){ int ret = -ENOPROTOOPT;#ifdef CONFIG_NETDEVICES struct net *net = sk->sk_net; char devname[IFNAMSIZ]; int index; /* Sorry... */ ret = -EPERM; if (!capable(CAP_NET_RAW)) goto out; ret = -EINVAL; if (optlen < 0) goto out; /* Bind this socket to a particular device like "eth0", * as specified in the passed interface name. If the * name is "" or the option length is zero the socket * is not bound. */ if (optlen > IFNAMSIZ - 1) optlen = IFNAMSIZ - 1; memset(devname, 0, sizeof(devname)); ret = -EFAULT; if (copy_from_user(devname, optval, optlen)) goto out; if (devname[0] == '\0') { index = 0; } else { struct net_device *dev = dev_get_by_name(net, devname); ret = -ENODEV; if (!dev) goto out; index = dev->ifindex; dev_put(dev); } lock_sock(sk); sk->sk_bound_dev_if = index; sk_dst_reset(sk); release_sock(sk); ret = 0;out:#endif return ret;}/* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. */int sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen){ struct sock *sk=sock->sk; int val; int valbool; struct linger ling; int ret = 0; /* * Options without arguments */#ifdef SO_DONTLINGER /* Compatibility item... */ if (optname == SO_DONTLINGER) { lock_sock(sk); sock_reset_flag(sk, SOCK_LINGER); release_sock(sk); return 0; }#endif if (optname == SO_BINDTODEVICE) return sock_bindtodevice(sk, optval, optlen); if (optlen < sizeof(int)) return -EINVAL; if (get_user(val, (int __user *)optval)) return -EFAULT; valbool = val?1:0; lock_sock(sk); switch(optname) { case SO_DEBUG: if (val && !capable(CAP_NET_ADMIN)) { ret = -EACCES; } else if (valbool) sock_set_flag(sk, SOCK_DBG); else sock_reset_flag(sk, SOCK_DBG); break; case SO_REUSEADDR: sk->sk_reuse = valbool; break; case SO_TYPE: case SO_ERROR: ret = -ENOPROTOOPT; break; case SO_DONTROUTE: if (valbool) sock_set_flag(sk, SOCK_LOCALROUTE); else sock_reset_flag(sk, SOCK_LOCALROUTE); break; case SO_BROADCAST: sock_valbool_flag(sk, SOCK_BROADCAST, valbool); break; case SO_SNDBUF: /* Don't error on this BSD doesn't and if you think about it this is right. Otherwise apps have to play 'guess the biggest size' games. RCVBUF/SNDBUF are treated in BSD as hints */ if (val > sysctl_wmem_max) val = sysctl_wmem_max;set_sndbuf: sk->sk_userlocks |= SOCK_SNDBUF_LOCK; if ((val * 2) < SOCK_MIN_SNDBUF) sk->sk_sndbuf = SOCK_MIN_SNDBUF; else sk->sk_sndbuf = val * 2; /* * Wake up sending tasks if we * upped the value. */ sk->sk_write_space(sk); break; case SO_SNDBUFFORCE: if (!capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } goto set_sndbuf; case SO_RCVBUF: /* Don't error on this BSD doesn't and if you think about it this is right. Otherwise apps have to play 'guess the biggest size' games. RCVBUF/SNDBUF are treated in BSD as hints */ if (val > sysctl_rmem_max) val = sysctl_rmem_max;set_rcvbuf:
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -