📄 tcp_ipv4.c
                           as state holder.  If TW bucket has been already
                           destroyed we fall back to VJ's scheme and use
                           initial timestamp retrieved from peer table. */
                        if (tw->ts_recent_stamp) {
                                if ((tp->write_seq = tw->snd_nxt+65535+2) == 0)
                                        tp->write_seq = 1;
                                tp->ts_recent = tw->ts_recent;
                                tp->ts_recent_stamp = tw->ts_recent_stamp;
                                sock_hold(sk2);
                                skp = &head->chain;
                                goto unique;
                        } else
                                goto not_unique;
                }
        }
        tw = NULL;

        /* And established part... */
        for(skp = &head->chain; (sk2=*skp)!=NULL; skp = &sk2->next) {
                if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
                        goto not_unique;
        }

unique:
        BUG_TRAP(sk->pprev==NULL);
        if ((sk->next = *skp) != NULL)
                (*skp)->pprev = &sk->next;
        *skp = sk;
        sk->pprev = skp;
        sk->hashent = hash;
        sock_prot_inc_use(sk->prot);
        write_unlock_bh(&head->lock);

        if (tw) {
                /* Silly. Should hash-dance instead... */
                local_bh_disable();
                tcp_tw_deschedule(tw);
                tcp_timewait_kill(tw);
                NET_INC_STATS_BH(TimeWaitRecycled);
                local_bh_enable();

                tcp_tw_put(tw);
        }

        return 0;

not_unique:
        write_unlock_bh(&head->lock);
        return -EADDRNOTAVAIL;
}

/* Hash SYN-SENT socket to established hash table after
 * checking that it is unique. Note, that without kernel lock
 * we MUST make these two operations atomically.
 *
 * Optimization: if it is bound and tcp_bind_bucket has the only
 * owner (us), we need not to scan established bucket.
 */
int tcp_v4_hash_connecting(struct sock *sk)
{
        unsigned short snum = sk->num;
        struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(snum)];
        struct tcp_bind_bucket *tb = (struct tcp_bind_bucket *)sk->prev;

        spin_lock_bh(&head->lock);
        if (tb->owners == sk && sk->bind_next == NULL) {
                __tcp_v4_hash(sk);
                spin_unlock_bh(&head->lock);
                return 0;
        } else {
                spin_unlock_bh(&head->lock);

                /* No definite answer... Walk to established hash table */
                return tcp_v4_check_established(sk);
        }
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
        struct sk_buff *buff;
        struct rtable *rt;
        u32 daddr, nexthop;
        int tmp;
        int err;

        if (addr_len < sizeof(struct sockaddr_in))
                return(-EINVAL);

        if (usin->sin_family != AF_INET)
                return(-EAFNOSUPPORT);

        nexthop = daddr = usin->sin_addr.s_addr;
        if (sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) {
                if (daddr == 0)
                        return -EINVAL;
                nexthop = sk->protinfo.af_inet.opt->faddr;
        }

        tmp = ip_route_connect(&rt, nexthop, sk->saddr,
                               RT_TOS(sk->protinfo.af_inet.tos)|RTO_CONN|sk->localroute,
                               sk->bound_dev_if);
        if (tmp < 0)
                return tmp;

        if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        __sk_dst_set(sk, &rt->u.dst);

        if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr)
                daddr = rt->rt_dst;

        err = -ENOBUFS;
        buff = alloc_skb(MAX_TCP_HEADER + 15, GFP_KERNEL);

        if (buff == NULL)
                goto failure;

        if (!sk->saddr)
                sk->saddr = rt->rt_src;
        sk->rcv_saddr = sk->saddr;

        if (tp->ts_recent_stamp && sk->daddr != daddr) {
                /* Reset inherited state */
                tp->ts_recent = 0;
                tp->ts_recent_stamp = 0;
                tp->write_seq = 0;
        }

        if (sysctl_tcp_tw_recycle &&
            !tp->ts_recent_stamp && rt->rt_dst == daddr) {
                struct inet_peer *peer = rt_get_peer(rt);

                /* VJ's idea. We save last timestamp seen from
                 * the destination in peer table, when entering state TIME-WAIT
                 * and initialize ts_recent from it, when trying new connection.
                 */
                if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
                        tp->ts_recent_stamp = peer->tcp_ts_stamp;
                        tp->ts_recent = peer->tcp_ts;
                }
        }

        sk->dport = usin->sin_port;
        sk->daddr = daddr;

        if (!tp->write_seq)
                tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
                                                           sk->sport, usin->sin_port);

        tp->ext_header_len = 0;
        if (sk->protinfo.af_inet.opt)
                tp->ext_header_len = sk->protinfo.af_inet.opt->optlen;

        tp->mss_clamp = 536;

        err = tcp_connect(sk, buff);
        if (err == 0)
                return 0;

failure:
        __sk_dst_reset(sk);
        sk->dport = 0;
        return err;
}
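The function above is what services an application's connect(2) on an IPv4 TCP socket: it resolves the route, picks the source address, primes ts_recent from the peer table when tcp_tw_recycle is on, and chooses the initial sequence number. As a rough user-space illustration only (this sketch is not part of tcp_ipv4.c; the 10.0.0.1:8080 endpoint is an arbitrary placeholder):

/* Minimal blocking client; errors are just printed. */
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        struct sockaddr_in dst;
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0) {
                perror("socket");
                return 1;
        }

        memset(&dst, 0, sizeof(dst));
        dst.sin_family = AF_INET;            /* anything else => -EAFNOSUPPORT */
        dst.sin_port = htons(8080);          /* becomes sk->dport */
        inet_pton(AF_INET, "10.0.0.1", &dst.sin_addr);

        /* An address shorter than sockaddr_in would be rejected with -EINVAL. */
        if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
                perror("connect");

        close(fd);
        return 0;
}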
static __inline__ int tcp_v4_iif(struct sk_buff *skb)
{
        return ((struct rtable*)skb->dst)->rt_iif;
}

static __inline__ unsigned tcp_v4_synq_hash(u32 raddr, u16 rport)
{
        unsigned h = raddr ^ rport;
        h ^= h>>16;
        h ^= h>>8;
        return h&(TCP_SYNQ_HSIZE-1);
}

static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
                                              struct iphdr *iph,
                                              struct tcphdr *th,
                                              struct open_request ***prevp)
{
        struct tcp_listen_opt *lopt = tp->listen_opt;
        struct open_request *req, **prev;
        __u16 rport = th->source;
        __u32 raddr = iph->saddr;

        for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport)];
             (req = *prev) != NULL;
             prev = &req->dl_next) {
                if (req->rmt_port == rport &&
                    req->af.v4_req.rmt_addr == raddr &&
                    req->af.v4_req.loc_addr == iph->daddr &&
                    TCP_INET_FAMILY(req->class->family)) {
                        BUG_TRAP(req->sk == NULL);
                        *prevp = prev;
                        return req;
                }
        }

        return NULL;
}

static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        struct tcp_listen_opt *lopt = tp->listen_opt;
        unsigned h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port);

        req->expires = jiffies + TCP_TIMEOUT_INIT;
        req->retrans = 0;
        req->sk = NULL;
        req->index = h;
        req->dl_next = lopt->syn_table[h];

        write_lock(&tp->syn_wait_lock);
        lopt->syn_table[h] = req;
        write_unlock(&tp->syn_wait_lock);

        tcp_synq_added(sk);
}
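The SYN-queue helpers above spread embryonic (SYN_RECV) connections across TCP_SYNQ_HSIZE buckets by XOR-folding the peer's address and port. The standalone sketch below recomputes the same bucket index; the table size of 512 and the sample peer are assumptions for illustration, and the kernel feeds both values in network byte order:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

#define SYNQ_HSIZE 512  /* stand-in for TCP_SYNQ_HSIZE (assumed 512) */

/* Mirrors tcp_v4_synq_hash(): XOR-fold the remote address and port,
 * then mask the result down to a bucket index. */
static unsigned synq_hash(uint32_t raddr, uint16_t rport)
{
        unsigned h = raddr ^ rport;
        h ^= h >> 16;
        h ^= h >> 8;
        return h & (SYNQ_HSIZE - 1);
}

int main(void)
{
        struct in_addr peer;

        inet_pton(AF_INET, "192.0.2.7", &peer);          /* example peer */
        printf("bucket = %u\n", synq_hash(peer.s_addr, htons(40000)));
        return 0;
}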
/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu)
{
        struct dst_entry *dst;
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

        /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
         * send out by Linux are always <576bytes so they should go through
         * unfragmented).
         */
        if (sk->state == TCP_LISTEN)
                return;

        /* We don't check in the destentry if pmtu discovery is forbidden
         * on this route. We just assume that no packet_to_big packets
         * are send back when pmtu discovery is not active.
         * There is a small race when the user changes this flag in the
         * route, but I think that's acceptable.
         */
        if ((dst = __sk_dst_check(sk, 0)) == NULL)
                return;

        ip_rt_update_pmtu(dst, mtu);

        /* Something is about to be wrong... Remember soft error
         * for the case, if this connection will not able to recover.
         */
        if (mtu < dst->pmtu && ip_dont_fragment(sk, dst))
                sk->err_soft = EMSGSIZE;

        if (sk->protinfo.af_inet.pmtudisc != IP_PMTUDISC_DONT &&
            tp->pmtu_cookie > dst->pmtu) {
                tcp_sync_mss(sk, dst->pmtu);

                /* Resend the TCP packet because it's
                 * clear that the old packet has been
                 * dropped. This is the new "fast" path mtu
                 * discovery.
                 */
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
{
        struct iphdr *iph = (struct iphdr*)dp;
        struct tcphdr *th;
        struct tcp_opt *tp;
        int type = skb->h.icmph->type;
        int code = skb->h.icmph->code;
#if ICMP_MIN_LENGTH < 14
        int no_flags = 0;
#else
#define no_flags 0
#endif
        struct sock *sk;
        __u32 seq;
        int err;

        if (len < (iph->ihl << 2) + ICMP_MIN_LENGTH) {
                ICMP_INC_STATS_BH(IcmpInErrors);
                return;
        }
#if ICMP_MIN_LENGTH < 14
        if (len < (iph->ihl << 2) + 14)
                no_flags = 1;
#endif

        th = (struct tcphdr*)(dp+(iph->ihl<<2));

        sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, tcp_v4_iif(skb));
        if (sk == NULL) {
                ICMP_INC_STATS_BH(IcmpInErrors);
                return;
        }
        if (sk->state == TCP_TIME_WAIT) {
                tcp_tw_put((struct tcp_tw_bucket*)sk);
                return;
        }

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         */
        if (sk->lock.users != 0)
                NET_INC_STATS_BH(LockDroppedIcmps);

        if (sk->state == TCP_CLOSE)
                goto out;

        tp = &sk->tp_pinfo.af_tcp;
        seq = ntohl(th->seq);
        if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
                NET_INC_STATS(OutOfWindowIcmps);
                goto out;
        }

        switch (type) {
        case ICMP_SOURCE_QUENCH:
                /* This is deprecated, but if someone generated it,
                 * we have no reasons to ignore it.
                 */
                if (sk->lock.users == 0)
                        tcp_enter_cwr(tp);
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        if (sk->lock.users == 0)
                                do_pmtu_discovery(sk, iph, ntohs(skb->h.icmph->un.frag.mtu));
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->state) {
                struct open_request *req, **prev;
        case TCP_LISTEN:
                if (sk->lock.users != 0)
                        goto out;

                /* The final ACK of the handshake should be already
                 * handled in the new socket context, not here.
                 * Strictly speaking - an ICMP error for the final
                 * ACK should set the opening flag, but that is too
                 * complicated right now.
                 */
                if (!no_flags && !th->syn && !th->ack)
                        goto out;

                req = tcp_v4_search_req(tp, iph, th, &prev);
                if (!req)
                        goto out;

                /* ICMPs are not backlogged, hence we cannot get an
                   established socket here.
                 */
                BUG_TRAP(req->sk == NULL);

                if (seq != req->snt_isn) {
                        NET_INC_STATS_BH(OutOfWindowIcmps);
                        goto out;
                }

                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                tcp_synq_drop(sk, req, prev);
                goto out;

        case TCP_SYN_SENT:
        case TCP_SYN_RECV:  /* Cannot happen.
                               It can f.e. if SYNs crossed.
                             */
                if (!no_flags && !th->syn)
                        goto out;

                if (sk->lock.users == 0) {
                        TCP_INC_STATS_BH(TcpAttemptFails);
                        sk->err = err;

                        sk->error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows to consider as hard errors
         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
         * but it is obsoleted by pmtu discovery).
         *
         * Note, that in modern internet, where routing is unreliable
         * and in each dark corner broken firewalls sit, sending random
         * errors ordered by their masters even this two messages finally lose
         * their original sense (even Linux sends invalid PORT_UNREACHs)
         *
         * Now we are in compliance with RFCs.
         *                                              --ANK (980905)
         */

        if (sk->lock.users == 0 && sk->protinfo.af_inet.recverr) {
                sk->err = err;
                sk->error_report(sk);
        } else  { /* Only an error on timeout */
                sk->err_soft = err;
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
}
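tcp_v4_err() above only turns an ICMP error into a hard error on an established connection when the application has enabled IP_RECVERR (sk->protinfo.af_inet.recverr), and do_pmtu_discovery() honours the per-socket IP_MTU_DISCOVER setting. A user-space sketch of those two knobs follows; it uses standard Linux socket options rather than anything defined in this file, and IP_MTU is only meaningful once the socket is connected:

#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

/* Normally called on a socket that is (or is about to be) connected. */
static void tune_tcp_error_handling(int fd)
{
        int on = 1;
        int pmtu_mode = IP_PMTUDISC_DO;  /* set DF and react to FRAG_NEEDED */
        int mtu;
        socklen_t len = sizeof(mtu);

        /* Report ICMP errors to the application instead of only err_soft. */
        setsockopt(fd, IPPROTO_IP, IP_RECVERR, &on, sizeof(on));
        setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &pmtu_mode, sizeof(pmtu_mode));

        /* Valid only while the socket is connected; fails quietly here. */
        if (getsockopt(fd, IPPROTO_IP, IP_MTU, &mtu, &len) == 0)
                printf("current path MTU: %d\n", mtu);
}

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd >= 0)
                tune_tcp_error_handling(fd);  /* normally done around connect() */
        return 0;
}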
/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
                       struct sk_buff *skb)
{
        th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
                                 csum_partial((char *)th, th->doff<<2, skb->csum));
}

/*
 *      This routine will send an RST to the other tcp.
 *
 *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *                    for reset.
 *      Answer: if a packet caused RST, it is not for a socket
 *              existing in our system, if it is matched to a socket,
 *              it is just duplicate segment or bug in other side's TCP.
 *      So that we build reply only basing on parameters
 *      arrived with segment.
 *      Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sk_buff *skb)
{
        struct tcphdr *th = skb->h.th;
        struct tcphdr rth;
        struct ip_reply_arg arg;

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;

        if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rth, 0, sizeof(struct tcphdr));
        rth.dest = th->source;
        rth.source = th->dest;
        rth.doff = sizeof(struct tcphdr)/4;
        rth.rst = 1;

        if (th->ack) {
                rth.seq = th->ack_seq;
        } else {
                rth.ack = 1;
                rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                                    skb->len - (th->doff<<2));
        }

        memset(&arg, 0, sizeof arg);
        arg.iov[0].iov_base = (unsigned char *)&rth;
        arg.iov[0].iov_len = sizeof rth;
        arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
                                      skb->nh.iph->saddr, /*XXX*/ sizeof(struct tcphdr),
                                      IPPROTO_TCP, 0);
        arg.n_iov = 1;
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;

        ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);

        TCP_INC_STATS_BH(TcpOutSegs);
        TCP_INC_STATS_BH(TcpOutRsts);
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts)
{
        struct tcphdr *th = skb->h.th;
        struct {
                struct tcphdr th;
                u32 tsopt[3];
        } rep;
        struct ip_reply_arg arg;

        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof arg);

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len = sizeof(rep.th);
        arg.n_iov = 1;
        if (ts) {
                rep.tsopt[0] = __constant_htonl((TCPOPT_NOP << 24) |
                                                (TCPOPT_NOP << 16) |
                                                (TCPOPT_TIMESTAMP << 8) |