📄 tcp_ipv4.c
                           as state holder.  If TW bucket has been already
                           destroyed we fall back to VJ's scheme and use
                           initial timestamp retrieved from peer table. */
                        if (tw->ts_recent_stamp) {
                                if ((tp->write_seq = tw->snd_nxt+65535+2) == 0)
                                        tp->write_seq = 1;
                                tp->ts_recent = tw->ts_recent;
                                tp->ts_recent_stamp = tw->ts_recent_stamp;
                                sock_hold(sk2);
                                skp = &head->chain;
                                goto unique;
                        } else
                                goto not_unique;
                }
        }
        tw = NULL;

        /* And established part... */
        for(skp = &head->chain; (sk2=*skp)!=NULL; skp = &sk2->next) {
                if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
                        goto not_unique;
        }

unique:
        BUG_TRAP(sk->pprev==NULL);
        if ((sk->next = *skp) != NULL)
                (*skp)->pprev = &sk->next;
        *skp = sk;
        sk->pprev = skp;
        sk->hashent = hash;
        sock_prot_inc_use(sk->prot);
        write_unlock_bh(&head->lock);

        if (tw) {
                /* Silly. Should hash-dance instead... */
                local_bh_disable();
                tcp_tw_deschedule(tw);
                tcp_timewait_kill(tw);
                NET_INC_STATS_BH(TimeWaitRecycled);
                local_bh_enable();

                tcp_tw_put(tw);
        }

        return 0;

not_unique:
        write_unlock_bh(&head->lock);
        return -EADDRNOTAVAIL;
}

/* Hash SYN-SENT socket to established hash table after
 * checking that it is unique. Note, that without kernel lock
 * we MUST make these two operations atomically.
 *
 * Optimization: if it is bound and tcp_bind_bucket has the only
 * owner (us), we need not to scan established bucket.
 */
int tcp_v4_hash_connecting(struct sock *sk)
{
        unsigned short snum = sk->num;
        struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(snum)];
        struct tcp_bind_bucket *tb = (struct tcp_bind_bucket *)sk->prev;

        spin_lock_bh(&head->lock);
        if (tb->owners == sk && sk->bind_next == NULL) {
                __tcp_v4_hash(sk);
                spin_unlock_bh(&head->lock);
                return 0;
        } else {
                spin_unlock_bh(&head->lock);

                /* No definite answer... Walk to established hash table */
                return tcp_v4_check_established(sk);
        }
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
        struct sk_buff *buff;
        struct rtable *rt;
        u32 daddr, nexthop;
        int tmp;
        int err;

        if (addr_len < sizeof(struct sockaddr_in))
                return(-EINVAL);

        if (usin->sin_family != AF_INET)
                return(-EAFNOSUPPORT);

        nexthop = daddr = usin->sin_addr.s_addr;
        if (sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) {
                if (daddr == 0)
                        return -EINVAL;
                nexthop = sk->protinfo.af_inet.opt->faddr;
        }

        tmp = ip_route_connect(&rt, nexthop, sk->saddr,
                               RT_TOS(sk->protinfo.af_inet.tos)|RTO_CONN|sk->localroute,
                               sk->bound_dev_if);
        if (tmp < 0)
                return tmp;

        if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        __sk_dst_set(sk, &rt->u.dst);

        if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr)
                daddr = rt->rt_dst;

        err = -ENOBUFS;
        buff = alloc_skb(MAX_TCP_HEADER + 15, GFP_KERNEL);

        if (buff == NULL)
                goto failure;

        if (!sk->saddr)
                sk->saddr = rt->rt_src;
        sk->rcv_saddr = sk->saddr;

        if (tp->ts_recent_stamp && sk->daddr != daddr) {
                /* Reset inherited state */
                tp->ts_recent = 0;
                tp->ts_recent_stamp = 0;
                tp->write_seq = 0;
        }

        if (sysctl_tcp_tw_recycle &&
            !tp->ts_recent_stamp && rt->rt_dst == daddr) {
                struct inet_peer *peer = rt_get_peer(rt);

                /* VJ's idea. We save last timestamp seen from
                 * the destination in peer table, when entering state TIME-WAIT
                 * and initialize ts_recent from it, when trying new connection.
                 */
                if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
                        tp->ts_recent_stamp = peer->tcp_ts_stamp;
                        tp->ts_recent = peer->tcp_ts;
                }
        }

        sk->dport = usin->sin_port;
        sk->daddr = daddr;

        if (!tp->write_seq)
                tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
                                                           sk->sport, usin->sin_port);

        tp->ext_header_len = 0;
        if (sk->protinfo.af_inet.opt)
                tp->ext_header_len = sk->protinfo.af_inet.opt->optlen;

        tp->mss_clamp = 536;

        err = tcp_connect(sk, buff);
        if (err == 0)
                return 0;

failure:
        __sk_dst_reset(sk);
        sk->dport = 0;
        return err;
}
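The function above is what services an application's connect(2) on an IPv4 TCP socket: it resolves the route, picks the source address, primes ts_recent from the peer table when tcp_tw_recycle is on, and chooses the initial sequence number. As a rough user-space illustration only (this sketch is not part of tcp_ipv4.c; the 10.0.0.1:8080 endpoint is an arbitrary placeholder):

/* Minimal blocking client; errors are just printed. */
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        struct sockaddr_in dst;
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0) {
                perror("socket");
                return 1;
        }

        memset(&dst, 0, sizeof(dst));
        dst.sin_family = AF_INET;            /* anything else => -EAFNOSUPPORT */
        dst.sin_port = htons(8080);          /* becomes sk->dport */
        inet_pton(AF_INET, "10.0.0.1", &dst.sin_addr);

        /* An address shorter than sockaddr_in would be rejected with -EINVAL. */
        if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
                perror("connect");

        close(fd);
        return 0;
}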
static __inline__ int tcp_v4_iif(struct sk_buff *skb)
{
        return ((struct rtable*)skb->dst)->rt_iif;
}

static __inline__ unsigned tcp_v4_synq_hash(u32 raddr, u16 rport)
{
        unsigned h = raddr ^ rport;
        h ^= h>>16;
        h ^= h>>8;
        return h&(TCP_SYNQ_HSIZE-1);
}

static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
                                              struct iphdr *iph,
                                              struct tcphdr *th,
                                              struct open_request ***prevp)
{
        struct tcp_listen_opt *lopt = tp->listen_opt;
        struct open_request *req, **prev;
        __u16 rport = th->source;
        __u32 raddr = iph->saddr;

        for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport)];
             (req = *prev) != NULL;
             prev = &req->dl_next) {
                if (req->rmt_port == rport &&
                    req->af.v4_req.rmt_addr == raddr &&
                    req->af.v4_req.loc_addr == iph->daddr &&
                    TCP_INET_FAMILY(req->class->family)) {
                        BUG_TRAP(req->sk == NULL);
                        *prevp = prev;
                        return req;
                }
        }

        return NULL;
}

static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        struct tcp_listen_opt *lopt = tp->listen_opt;
        unsigned h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port);

        req->expires = jiffies + TCP_TIMEOUT_INIT;
        req->retrans = 0;
        req->sk = NULL;
        req->index = h;
        req->dl_next = lopt->syn_table[h];

        write_lock(&tp->syn_wait_lock);
        lopt->syn_table[h] = req;
        write_unlock(&tp->syn_wait_lock);

        tcp_synq_added(sk);
}
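The SYN-queue helpers above spread embryonic (SYN_RECV) connections across TCP_SYNQ_HSIZE buckets by XOR-folding the peer's address and port. The standalone sketch below recomputes the same bucket index; the table size of 512 and the sample peer are assumptions for illustration, and the kernel feeds both values in network byte order:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

#define SYNQ_HSIZE 512  /* stand-in for TCP_SYNQ_HSIZE (assumed 512) */

/* Mirrors tcp_v4_synq_hash(): XOR-fold the remote address and port,
 * then mask the result down to a bucket index. */
static unsigned synq_hash(uint32_t raddr, uint16_t rport)
{
        unsigned h = raddr ^ rport;
        h ^= h >> 16;
        h ^= h >> 8;
        return h & (SYNQ_HSIZE - 1);
}

int main(void)
{
        struct in_addr peer;

        inet_pton(AF_INET, "192.0.2.7", &peer);          /* example peer */
        printf("bucket = %u\n", synq_hash(peer.s_addr, htons(40000)));
        return 0;
}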
/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu)
{
        struct dst_entry *dst;
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

        /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
         * send out by Linux are always <576bytes so they should go through
         * unfragmented).
         */
        if (sk->state == TCP_LISTEN)
                return;

        /* We don't check in the destentry if pmtu discovery is forbidden
         * on this route. We just assume that no packet_to_big packets
         * are send back when pmtu discovery is not active.
         * There is a small race when the user changes this flag in the
         * route, but I think that's acceptable.
         */
        if ((dst = __sk_dst_check(sk, 0)) == NULL)
                return;

        ip_rt_update_pmtu(dst, mtu);

        /* Something is about to be wrong... Remember soft error
         * for the case, if this connection will not able to recover.
         */
        if (mtu < dst->pmtu && ip_dont_fragment(sk, dst))
                sk->err_soft = EMSGSIZE;

        if (sk->protinfo.af_inet.pmtudisc != IP_PMTUDISC_DONT &&
            tp->pmtu_cookie > dst->pmtu) {
                tcp_sync_mss(sk, dst->pmtu);

                /* Resend the TCP packet because it's
                 * clear that the old packet has been
                 * dropped. This is the new "fast" path mtu
                 * discovery.
                 */
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
{
        struct iphdr *iph = (struct iphdr*)dp;
        struct tcphdr *th;
        struct tcp_opt *tp;
        int type = skb->h.icmph->type;
        int code = skb->h.icmph->code;
#if ICMP_MIN_LENGTH < 14
        int no_flags = 0;
#else
#define no_flags 0
#endif
        struct sock *sk;
        __u32 seq;
        int err;

        if (len < (iph->ihl << 2) + ICMP_MIN_LENGTH) {
                ICMP_INC_STATS_BH(IcmpInErrors);
                return;
        }
#if ICMP_MIN_LENGTH < 14
        if (len < (iph->ihl << 2) + 14)
                no_flags = 1;
#endif

        th = (struct tcphdr*)(dp+(iph->ihl<<2));

        sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, tcp_v4_iif(skb));
        if (sk == NULL) {
                ICMP_INC_STATS_BH(IcmpInErrors);
                return;
        }
        if (sk->state == TCP_TIME_WAIT) {
                tcp_tw_put((struct tcp_tw_bucket*)sk);
                return;
        }

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         */
        if (sk->lock.users != 0)
                NET_INC_STATS_BH(LockDroppedIcmps);

        if (sk->state == TCP_CLOSE)
                goto out;

        tp = &sk->tp_pinfo.af_tcp;
        seq = ntohl(th->seq);
        if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
                NET_INC_STATS(OutOfWindowIcmps);
                goto out;
        }

        switch (type) {
        case ICMP_SOURCE_QUENCH:
                /* This is deprecated, but if someone generated it,
                 * we have no reasons to ignore it.
                 */
                if (sk->lock.users == 0)
                        tcp_enter_cwr(tp);
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        if (sk->lock.users == 0)
                                do_pmtu_discovery(sk, iph, ntohs(skb->h.icmph->un.frag.mtu));
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->state) {
                struct open_request *req, **prev;
        case TCP_LISTEN:
                if (sk->lock.users != 0)
                        goto out;

                /* The final ACK of the handshake should be already
                 * handled in the new socket context, not here.
                 * Strictly speaking - an ICMP error for the final
                 * ACK should set the opening flag, but that is too
                 * complicated right now.
                 */
                if (!no_flags && !th->syn && !th->ack)
                        goto out;

                req = tcp_v4_search_req(tp, iph, th, &prev);
                if (!req)
                        goto out;

                /* ICMPs are not backlogged, hence we cannot get an
                   established socket here.
                 */
                BUG_TRAP(req->sk == NULL);

                if (seq != req->snt_isn) {
                        NET_INC_STATS_BH(OutOfWindowIcmps);
                        goto out;
                }

                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                tcp_synq_drop(sk, req, prev);
                goto out;

        case TCP_SYN_SENT:
        case TCP_SYN_RECV:  /* Cannot happen.
                               It can f.e. if SYNs crossed.
                             */
                if (!no_flags && !th->syn)
                        goto out;

                if (sk->lock.users == 0) {
                        TCP_INC_STATS_BH(TcpAttemptFails);
                        sk->err = err;

                        sk->error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows to consider as hard errors
         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
         * but it is obsoleted by pmtu discovery).
         *
         * Note, that in modern internet, where routing is unreliable
         * and in each dark corner broken firewalls sit, sending random
         * errors ordered by their masters even this two messages finally lose
         * their original sense (even Linux sends invalid PORT_UNREACHs)
         *
         * Now we are in compliance with RFCs.
         *                                              --ANK (980905)
         */

        if (sk->lock.users == 0 && sk->protinfo.af_inet.recverr) {
                sk->err = err;
                sk->error_report(sk);
        } else  { /* Only an error on timeout */
                sk->err_soft = err;
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
}
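tcp_v4_err() above only turns an ICMP error into a hard error on an established connection when the application has enabled IP_RECVERR (sk->protinfo.af_inet.recverr), and do_pmtu_discovery() honours the per-socket IP_MTU_DISCOVER setting. A user-space sketch of those two knobs follows; it uses standard Linux socket options rather than anything defined in this file, and IP_MTU is only meaningful once the socket is connected:

#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

/* Normally called on a socket that is (or is about to be) connected. */
static void tune_tcp_error_handling(int fd)
{
        int on = 1;
        int pmtu_mode = IP_PMTUDISC_DO;  /* set DF and react to FRAG_NEEDED */
        int mtu;
        socklen_t len = sizeof(mtu);

        /* Report ICMP errors to the application instead of only err_soft. */
        setsockopt(fd, IPPROTO_IP, IP_RECVERR, &on, sizeof(on));
        setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &pmtu_mode, sizeof(pmtu_mode));

        /* Valid only while the socket is connected; fails quietly here. */
        if (getsockopt(fd, IPPROTO_IP, IP_MTU, &mtu, &len) == 0)
                printf("current path MTU: %d\n", mtu);
}

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd >= 0)
                tune_tcp_error_handling(fd);  /* normally done around connect() */
        return 0;
}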
/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
                       struct sk_buff *skb)
{
        th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
                                 csum_partial((char *)th, th->doff<<2, skb->csum));
}

/*
 *      This routine will send an RST to the other tcp.
 *
 *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *                    for reset.
 *      Answer: if a packet caused RST, it is not for a socket
 *              existing in our system, if it is matched to a socket,
 *              it is just duplicate segment or bug in other side's TCP.
 *      So that we build reply only basing on parameters
 *      arrived with segment.
 *      Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sk_buff *skb)
{
        struct tcphdr *th = skb->h.th;
        struct tcphdr rth;
        struct ip_reply_arg arg;

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;

        if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rth, 0, sizeof(struct tcphdr));
        rth.dest = th->source;
        rth.source = th->dest;
        rth.doff = sizeof(struct tcphdr)/4;
        rth.rst = 1;

        if (th->ack) {
                rth.seq = th->ack_seq;
        } else {
                rth.ack = 1;
                rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                                    skb->len - (th->doff<<2));
        }

        memset(&arg, 0, sizeof arg);
        arg.iov[0].iov_base = (unsigned char *)&rth;
        arg.iov[0].iov_len = sizeof rth;
        arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
                                      skb->nh.iph->saddr, /*XXX*/ sizeof(struct tcphdr),
                                      IPPROTO_TCP, 0);
        arg.n_iov = 1;
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;

        ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);

        TCP_INC_STATS_BH(TcpOutSegs);
        TCP_INC_STATS_BH(TcpOutRsts);
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts)
{
        struct tcphdr *th = skb->h.th;
        struct {
                struct tcphdr th;
                u32 tsopt[3];
        } rep;
        struct ip_reply_arg arg;

        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof arg);

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len = sizeof(rep.th);
        arg.n_iov = 1;
        if (ts) {
                rep.tsopt[0] = __constant_htonl((TCPOPT_NOP << 24) |
                                                (TCPOPT_NOP << 16) |
                                                (TCPOPT_TIMESTAMP << 8) |