tcp_output.c
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 */
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
			    gfp_t gfp_mask)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet;
	struct tcp_sock *tp;
	struct tcp_skb_cb *tcb;
	int tcp_header_size;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *md5;
	__u8 *md5_hash_location;
#endif
	struct tcphdr *th;
	int sysctl_flags;
	int err;

	BUG_ON(!skb || !tcp_skb_pcount(skb));

	/* If congestion control is doing timestamping, we must
	 * take such a timestamp before we potentially clone/copy.
	 */
	if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
		__net_timestamp(skb);

	if (likely(clone_it)) {
		if (unlikely(skb_cloned(skb)))
			skb = pskb_copy(skb, gfp_mask);
		else
			skb = skb_clone(skb, gfp_mask);
		if (unlikely(!skb))
			return -ENOBUFS;
	}

	inet = inet_sk(sk);
	tp = tcp_sk(sk);
	tcb = TCP_SKB_CB(skb);
	tcp_header_size = tp->tcp_header_len;

#define SYSCTL_FLAG_TSTAMPS	0x1
#define SYSCTL_FLAG_WSCALE	0x2
#define SYSCTL_FLAG_SACK	0x4

	sysctl_flags = 0;
	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
		tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
		if (sysctl_tcp_timestamps) {
			tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
			sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
		}
		if (sysctl_tcp_window_scaling) {
			tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
			sysctl_flags |= SYSCTL_FLAG_WSCALE;
		}
		if (sysctl_tcp_sack) {
			sysctl_flags |= SYSCTL_FLAG_SACK;
			if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
				tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
		}
	} else if (unlikely(tp->rx_opt.eff_sacks)) {
		/* A SACK is 2 pad bytes, a 2 byte header, plus
		 * 2 32-bit sequence numbers for each SACK block.
		 */
		tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
				    (tp->rx_opt.eff_sacks *
				     TCPOLEN_SACK_PERBLOCK));
	}

	if (tcp_packets_in_flight(tp) == 0)
		tcp_ca_event(sk, CA_EVENT_TX_START);

#ifdef CONFIG_TCP_MD5SIG
	/*
	 * Are we doing MD5 on this segment? If so - make
	 * room for it.
	 */
	md5 = tp->af_specific->md5_lookup(sk, sk);
	if (md5)
		tcp_header_size += TCPOLEN_MD5SIG_ALIGNED;
#endif

	skb_push(skb, tcp_header_size);
	skb_reset_transport_header(skb);
	skb_set_owner_w(skb, sk);

	/* Build TCP header and checksum it. */
	th = tcp_hdr(skb);
	th->source		= inet->sport;
	th->dest		= inet->dport;
	th->seq			= htonl(tcb->seq);
	th->ack_seq		= htonl(tp->rcv_nxt);
	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
					tcb->flags);

	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
		/* RFC1323: The window in SYN & SYN/ACK segments
		 * is never scaled.
		 */
		th->window	= htons(min(tp->rcv_wnd, 65535U));
	} else {
		th->window	= htons(tcp_select_window(sk));
	}
	th->check		= 0;
	th->urg_ptr		= 0;

	if (unlikely(tp->urg_mode &&
		     between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) {
		th->urg_ptr		= htons(tp->snd_up - tcb->seq);
		th->urg			= 1;
	}

	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
		tcp_syn_build_options((__be32 *)(th + 1),
				      tcp_advertise_mss(sk),
				      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
				      (sysctl_flags & SYSCTL_FLAG_SACK),
				      (sysctl_flags & SYSCTL_FLAG_WSCALE),
				      tp->rx_opt.rcv_wscale,
				      tcb->when,
				      tp->rx_opt.ts_recent,
#ifdef CONFIG_TCP_MD5SIG
				      md5 ? &md5_hash_location :
#endif
				      NULL);
	} else {
		tcp_build_and_update_options((__be32 *)(th + 1),
					     tp, tcb->when,
#ifdef CONFIG_TCP_MD5SIG
					     md5 ? &md5_hash_location :
#endif
					     NULL);
		TCP_ECN_send(sk, skb, tcp_header_size);
	}

#ifdef CONFIG_TCP_MD5SIG
	/* Calculate the MD5 hash, as we have all we need now */
	if (md5) {
		tp->af_specific->calc_md5_hash(md5_hash_location,
					       md5,
					       sk, NULL, NULL,
					       tcp_hdr(skb),
					       sk->sk_protocol,
					       skb->len);
	}
#endif

	icsk->icsk_af_ops->send_check(sk, skb->len, skb);

	if (likely(tcb->flags & TCPCB_FLAG_ACK))
		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));

	if (skb->len != tcp_header_size)
		tcp_event_data_sent(tp, skb, sk);

	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
		TCP_INC_STATS(TCP_MIB_OUTSEGS);

	err = icsk->icsk_af_ops->queue_xmit(skb, 0);
	if (likely(err <= 0))
		return err;

	tcp_enter_cwr(sk, 1);

	return net_xmit_eval(err);

#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
}
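/* Note on the combined store in tcp_transmit_skb() above: bytes 12-13 of the
 * TCP header carry the 4-bit data offset, four reserved bits and the eight
 * flag bits, so ((tcp_header_size >> 2) << 12) | tcb->flags builds that
 * 16-bit word in a single write.  For example, a 32-byte header (20-byte
 * base header plus 12 bytes of timestamp option) yields a data offset of 8,
 * i.e. 0x8000 OR'ed with the flag bits from the skb control block.
 */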
/* This routine just queue's the buffer
 *
 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
 * otherwise socket can stall.
 */
static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Advance write_seq and place onto the write_queue. */
	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
	skb_header_release(skb);
	tcp_add_write_queue_tail(sk, skb);
	sk_charge_skb(sk, skb);
}

static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb,
				 unsigned int mss_now)
{
	if (skb->len <= mss_now || !sk_can_gso(sk)) {
		/* Avoid the costly divide in the normal
		 * non-TSO case.
		 */
		skb_shinfo(skb)->gso_segs = 1;
		skb_shinfo(skb)->gso_size = 0;
		skb_shinfo(skb)->gso_type = 0;
	} else {
		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
		skb_shinfo(skb)->gso_size = mss_now;
		skb_shinfo(skb)->gso_type = sk->sk_gso_type;
	}
}
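/* Worked example for tcp_set_skb_tso_segs() above: on a GSO-capable socket,
 * an skb carrying 4000 bytes of payload with mss_now = 1448 gets
 * gso_segs = DIV_ROUND_UP(4000, 1448) = 3 and gso_size = 1448, so the
 * packet counters treat it as three segments on the wire; an skb no larger
 * than one MSS keeps gso_size = 0 and counts as a single segment.
 */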
/* When a modification to fackets out becomes necessary, we need to check
 * skb is counted to fackets_out or not. Another important thing is to
 * tweak SACK fastpath hint too as it would overwrite all changes unless
 * hint is also changed.
 */
static void tcp_adjust_fackets_out(struct tcp_sock *tp, struct sk_buff *skb,
				   int decr)
{
	if (!tp->sacked_out || tcp_is_reno(tp))
		return;

	if (!before(tp->highest_sack, TCP_SKB_CB(skb)->seq))
		tp->fackets_out -= decr;

	/* cnt_hint is "off-by-one" compared with fackets_out (see sacktag) */
	if (tp->fastpath_skb_hint != NULL &&
	    after(TCP_SKB_CB(tp->fastpath_skb_hint)->seq, TCP_SKB_CB(skb)->seq))
		tp->fastpath_cnt_hint -= decr;
}

/* Function to create two new TCP segments. Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list. This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 */
int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
		 unsigned int mss_now)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *buff;
	int nsize, old_factor;
	int nlen;
	u16 flags;

	BUG_ON(len > skb->len);

	tcp_clear_retrans_hints_partial(tp);
	nsize = skb_headlen(skb) - len;
	if (nsize < 0)
		nsize = 0;

	if (skb_cloned(skb) &&
	    skb_is_nonlinear(skb) &&
	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
		return -ENOMEM;

	/* Get a new skb... force flag on. */
	buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
	if (buff == NULL)
		return -ENOMEM; /* We'll just try again later. */

	sk_charge_skb(sk, buff);
	nlen = skb->len - len - nsize;
	buff->truesize += nlen;
	skb->truesize -= nlen;

	/* Correct the sequence numbers. */
	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

	if (tcp_is_sack(tp) && tp->sacked_out &&
	    (TCP_SKB_CB(skb)->seq == tp->highest_sack))
		tp->highest_sack = TCP_SKB_CB(buff)->seq;

	/* PSH and FIN should only be set in the second packet. */
	flags = TCP_SKB_CB(skb)->flags;
	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
	TCP_SKB_CB(buff)->flags = flags;
	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
	TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;

	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
		/* Copy and checksum data tail into the new buffer. */
		buff->csum = csum_partial_copy_nocheck(skb->data + len,
						       skb_put(buff, nsize),
						       nsize, 0);

		skb_trim(skb, len);

		skb->csum = csum_block_sub(skb->csum, buff->csum, len);
	} else {
		skb->ip_summed = CHECKSUM_PARTIAL;
		skb_split(skb, buff, len);
	}

	buff->ip_summed = skb->ip_summed;

	/* Looks stupid, but our code really uses when of
	 * skbs, which it never sent before. --ANK
	 */
	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
	buff->tstamp = skb->tstamp;

	old_factor = tcp_skb_pcount(skb);

	/* Fix up tso_factor for both original and new SKB. */
	tcp_set_skb_tso_segs(sk, skb, mss_now);
	tcp_set_skb_tso_segs(sk, buff, mss_now);

	/* If this packet has been sent out already, we must
	 * adjust the various packet counters.
	 */
	if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
		int diff = old_factor - tcp_skb_pcount(skb) -
			tcp_skb_pcount(buff);

		tp->packets_out -= diff;

		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
			tp->sacked_out -= diff;
		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
			tp->retrans_out -= diff;
		if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
			tp->lost_out -= diff;

		/* Adjust Reno SACK estimate. */
		if (tcp_is_reno(tp) && diff > 0) {
			tcp_dec_pcount_approx_int(&tp->sacked_out, diff);
			tcp_verify_left_out(tp);
		}
		tcp_adjust_fackets_out(tp, skb, diff);
	}

	/* Link BUFF into the send queue. */
	skb_header_release(buff);
	tcp_insert_write_queue_after(skb, buff, sk);

	return 0;
}

/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
 * eventually). The difference is that pulled data not copied, but
 * immediately discarded.
 */
static void __pskb_trim_head(struct sk_buff *skb, int len)
{
	int i, k, eat;

	eat = len;
	k = 0;
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		if (skb_shinfo(skb)->frags[i].size <= eat) {
			put_page(skb_shinfo(skb)->frags[i].page);
			eat -= skb_shinfo(skb)->frags[i].size;
		} else {
			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
			if (eat) {
				skb_shinfo(skb)->frags[k].page_offset += eat;
				skb_shinfo(skb)->frags[k].size -= eat;
				eat = 0;
			}
			k++;
		}
	}
	skb_shinfo(skb)->nr_frags = k;

	skb_reset_tail_pointer(skb);
	skb->data_len -= len;
	skb->len = skb->data_len;
}

int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
	if (skb_cloned(skb) &&
	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
		return -ENOMEM;

	/* If len == headlen, we avoid __skb_pull to preserve alignment. */
	if (unlikely(len < skb_headlen(skb)))
		__skb_pull(skb, len);
	else
		__pskb_trim_head(skb, len - skb_headlen(skb));

	TCP_SKB_CB(skb)->seq += len;
	skb->ip_summed = CHECKSUM_PARTIAL;

	skb->truesize	     -= len;
	sk->sk_wmem_queued   -= len;
	sk->sk_forward_alloc += len;
	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);

	/* Any change of skb->len requires recalculation of tso
	 * factor and mss.
	 */
	if (tcp_skb_pcount(skb) > 1)
		tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1));

	return 0;
}
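/* Illustration for tcp_fragment() above: splitting an already-sent 2896-byte
 * skb (two 1448-byte segments, old_factor = 2) at len = 1448 leaves a pcount
 * of 1 on each half, so diff = 2 - 1 - 1 = 0 and the in-flight counters are
 * unchanged; only when TSO rounding changes the total segment count does
 * diff adjust packets_out and the SACK/retrans/lost counters.
 */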
/* Not accounting for SACKs here. */
int tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int mss_now;

	/* Calculate base mss without TCP options:
	   It is MMS_S - sizeof(tcphdr) of rfc1122
	 */
	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);

	/* Clamp it (mss_clamp does not include tcp options) */
	if (mss_now > tp->rx_opt.mss_clamp)
		mss_now = tp->rx_opt.mss_clamp;

	/* Now subtract optional transport overhead */
	mss_now -= icsk->icsk_ext_hdr_len;

	/* Then reserve room for full set of TCP options and 8 bytes of data */
	if (mss_now < 48)
		mss_now = 48;

	/* Now subtract TCP options size, not including SACKs */
	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);

	return mss_now;
}

/* Inverse of above */
int tcp_mss_to_mtu(struct sock *sk, int mss)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int mtu;

	mtu = mss +
	      tp->tcp_header_len +
	      icsk->icsk_ext_hdr_len +
	      icsk->icsk_af_ops->net_header_len;

	return mtu;
}

void tcp_mtup_init(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1;
	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
				      icsk->icsk_af_ops->net_header_len;
	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
	icsk->icsk_mtup.probe_size = 0;
}
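/* Worked example for tcp_mtu_to_mss()/tcp_mss_to_mtu() above, assuming IPv4
 * with no IP options (net_header_len = 20), no extension headers and
 * timestamps enabled (tcp_header_len = 20 + 12 = 32):
 *
 *   tcp_mtu_to_mss(sk, 1500) = 1500 - 20 - 20      (base MSS 1460)
 *                              - 0                 (ext headers)
 *                              - (32 - 20)         (non-SACK options)
 *                            = 1448
 *
 *   tcp_mss_to_mtu(sk, 1448) = 1448 + 32 + 0 + 20 = 1500
 *
 * so the two really are inverses as long as neither clamp applies.
 */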