// SPDX-License-Identifier: GPL-2.0-only
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
 *				:	Fragmentation on mtu decrease
 *				:	Segment collapse on retransmit
 *				:	AF independence
 *
 *		Linus Torvalds	:	send_delayed_ack
 *		David S. Miller	:	Charge memory using the right skb
 *					during syn/ack processing.
 *		David S. Miller :	Output engine completely rewritten.
 *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
 *		Cacophonix Gaul :	draft-minshall-nagle-01
 *		J Hadi Salim	:	ECN support
 *
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <net/tcp.h>
#include <net/mptcp.h>

#include <linux/compiler.h>
#include <linux/gfp.h>
#include <linux/module.h>
#include <linux/static_key.h>

#include <trace/events/tcp.h>
/* Refresh clocks of a TCP socket,
 * ensuring monotonically increasing values.
 */
void tcp_mstamp_refresh(struct tcp_sock *tp)
{
	u64 val = tcp_clock_ns();

	tp->tcp_clock_cache = val;
	tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC);
}
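/* Note on the helper above: tcp_clock_cache keeps the raw nanosecond
 * timestamp while tcp_mstamp caches the same instant converted to
 * microseconds (val / NSEC_PER_USEC), so both views of "now" refer to
 * the same sample until the next refresh.
 */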

static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
			   int push_one, gfp_t gfp);

/* Account for new data that has been sent to the network. */
static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int prior_packets = tp->packets_out;

	WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq);

	__skb_unlink(skb, &sk->sk_write_queue);
	tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);

	if (tp->highest_sack == NULL)
		tp->highest_sack = skb;

	tp->packets_out += tcp_skb_pcount(skb);
	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
		tcp_rearm_rto(sk);

	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
		      tcp_skb_pcount(skb));
}

/* SND.NXT, if window was not shrunk or the amount shrunk was less than one
 * window scaling factor due to loss of precision.
 * If window has been shrunk, what should we do? It is not clear at all.
 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
 * invalid. OK, let's make this for now:
 */
static inline __u32 tcp_acceptable_seq(const struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	if (!before(tcp_wnd_end(tp), tp->snd_nxt) ||
	    (tp->rx_opt.wscale_ok &&
	     ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale))))
		return tp->snd_nxt;
	else
		return tcp_wnd_end(tp);
}

/* Calculate mss to advertise in SYN segment.
 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
 *
 * 1. It is independent of path mtu.
 * 2. Ideally, it is the maximal possible segment size, i.e. 65535-40.
 * 3. For IPv4 it is reasonable to calculate it from the maximal MTU of
 *    attached devices, because some buggy hosts are confused by
 *    large MSS.
 * 4. We do not do 3; we advertise the MSS calculated from the first
 *    hop device mtu, but allow it to be raised to ip_rt_min_advmss.
 *    This may be overridden via information stored in the routing table.
 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
 *    probably even Jumbo".
 */
static __u16 tcp_advertise_mss(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	const struct dst_entry *dst = __sk_dst_get(sk);
	int mss = tp->advmss;

	if (dst) {
		unsigned int metric = dst_metric_advmss(dst);

		if (metric < mss) {
			mss = metric;
			tp->advmss = mss;
		}
	}

	return (__u16)mss;
}
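/* Note: the helper above only ever lowers tp->advmss toward the route's
 * advmss metric (the "if (metric < mss)" test); it never raises it in
 * this path.
 */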

/* RFC2861. Reset CWND after idle period longer than RTO to "restart window".
 * This is the first part of the cwnd validation mechanism.
 */
void tcp_cwnd_restart(struct sock *sk, s32 delta)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
	u32 cwnd = tp->snd_cwnd;

	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);

	tp->snd_ssthresh = tcp_current_ssthresh(sk);
	restart_cwnd = min(restart_cwnd, cwnd);

	while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
		cwnd >>= 1;
	tp->snd_cwnd = max(cwnd, restart_cwnd);
	tp->snd_cwnd_stamp = tcp_jiffies32;
	tp->snd_cwnd_used = 0;
}
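/* Worked example (illustrative numbers): with snd_cwnd == 64, a
 * restart_cwnd of 10 and an idle period just over 3 * RTO, the loop
 * above halves cwnd three times (64 -> 32 -> 16 -> 8) and the final
 * max(cwnd, restart_cwnd) clamps snd_cwnd back up to 10.
 */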

/* Congestion state accounting after a packet has been sent. */
static void tcp_event_data_sent(struct tcp_sock *tp,
				struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const u32 now = tcp_jiffies32;

	if (tcp_packets_in_flight(tp) == 0)
		tcp_ca_event(sk, CA_EVENT_TX_START);

	/* If this is the first data packet sent in response to the
	 * previously received data, and it is sent within the delayed
	 * ACK timeout (ato) of the last received packet, increase the
	 * pingpong count.
	 */
	if (before(tp->lsndtime, icsk->icsk_ack.lrcvtime) &&
	    (u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
		inet_csk_inc_pingpong_cnt(sk);

	tp->lsndtime = now;
}

/* Account for an ACK we sent. */
static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
				      u32 rcv_nxt)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (unlikely(tp->compressed_ack)) {
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
			      tp->compressed_ack);
		tp->compressed_ack = 0;
		if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
			__sock_put(sk);
	}

	if (unlikely(rcv_nxt != tp->rcv_nxt))
		return; /* Special ACK sent by DCTCP to reflect ECN */
	tcp_dec_quickack_mode(sk, pkts);
	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
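/* Note: hrtimer_try_to_cancel() returning 1 means the compressed-ACK
 * timer was still queued and has now been cancelled, so the socket
 * reference held for the pending timer is dropped here via __sock_put().
 */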

/* Determine a window scaling and initial window to offer.
 * Based on the assumption that the given amount of space
 * will be offered. Store the results in the tp structure.
 * NOTE: for smooth operation initial space offering should
 * be a multiple of mss if possible. We assume here that mss >= 1.
 * This MUST be enforced by all callers.
 */
void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
			       __u32 *rcv_wnd, __u32 *window_clamp,
			       int wscale_ok, __u8 *rcv_wscale,
			       __u32 init_rcv_wnd)
{
	unsigned int space = (__space < 0 ? 0 : __space);

	/* If no clamp set the clamp to the max possible scaled window */
	if (*window_clamp == 0)
		(*window_clamp) = (U16_MAX << TCP_MAX_WSCALE);
	space = min(*window_clamp, space);

	/* Quantize space offering to a multiple of mss if possible. */
	if (space > mss)
		space = rounddown(space, mss);

	/* NOTE: offering an initial window larger than 32767
	 * will break some buggy TCP stacks. If the admin tells us
	 * it is likely we could be speaking with such a buggy stack
	 * we will truncate our initial window offering to 32K-1
	 * unless the remote has sent us a window scaling option,
	 * which we interpret as a sign the remote TCP is not
	 * misinterpreting the window field as a signed quantity.
	 */
	if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
		(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
	else
		(*rcv_wnd) = min_t(u32, space, U16_MAX);

	if (init_rcv_wnd)
		*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);

	*rcv_wscale = 0;
	if (wscale_ok) {
		/* Set window scaling on max possible window */
		space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
		space = max_t(u32, space, sysctl_rmem_max);
		space = min_t(u32, space, *window_clamp);
		*rcv_wscale = clamp_t(int, ilog2(space) - 15,
				      0, TCP_MAX_WSCALE);
	}
	/* Set the clamp no higher than max representable value */
	(*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
}
EXPORT_SYMBOL(tcp_select_initial_window);
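/* Worked example for the wscale computation above (illustrative values):
 * with tcp_rmem[2] == 6291456 (6 MB) and an unconstrained window_clamp,
 * ilog2(6291456) == 22, so rcv_wscale = clamp(22 - 15, 0, TCP_MAX_WSCALE)
 * == 7, i.e. the 16-bit window field is interpreted by the peer in units
 * of 128 bytes.
 */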

/* Choose a new window to advertise, update state in tcp_sock for the
 * socket, and return the result with RFC1323 scaling applied. The return
 * value can be stuffed directly into th->window for an outgoing
 * frame.
 */
static u16 tcp_select_window(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 old_win = tp->rcv_wnd;
	u32 cur_win = tcp_receive_window(tp);
	u32 new_win = __tcp_select_window(sk);

	/* Never shrink the offered window */
	if (new_win < cur_win) {
		/* Danger Will Robinson!
		 * Don't update rcv_wup/rcv_wnd here or else
		 * we will not be able to advertise a zero
		 * window in time. --DaveM
		 *
		 * Relax Will Robinson.
		 */
		if (new_win == 0)
			NET_INC_STATS(sock_net(sk),
				      LINUX_MIB_TCPWANTZEROWINDOWADV);
		new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
	}
	tp->rcv_wnd = new_win;
	tp->rcv_wup = tp->rcv_nxt;

	/* Make sure we do not exceed the maximum possible
	 * scaled window.
	 */
	if (!tp->rx_opt.rcv_wscale &&
	    sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
		new_win = min(new_win, MAX_TCP_WINDOW);
	else
		new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));

	/* RFC1323 scaling applied */
	new_win >>= tp->rx_opt.rcv_wscale;

	/* If we advertise zero window, disable fast path. */
	if (new_win == 0) {
		tp->pred_flags = 0;
		if (old_win)
			NET_INC_STATS(sock_net(sk),
				      LINUX_MIB_TCPTOZEROWINDOWADV);
	} else if (old_win == 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
	}

	return new_win;
}
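/* Example of the final scaling step above (illustrative values): a
 * receive window of 262144 bytes with rcv_wscale == 7 is advertised as
 * 262144 >> 7 == 2048 in the 16-bit window field; the peer multiplies
 * it back by 2^7 to recover the byte count.
 */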

/* Packet ECN state for a SYN-ACK */
static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
	if (!(tp->ecn_flags & TCP_ECN_OK))
		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
	else if (tcp_ca_needs_ecn(sk) ||
		 tcp_bpf_ca_needs_ecn(sk))
		INET_ECN_xmit(sk);
}

/* Packet ECN state for a SYN. */
static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
	bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
		tcp_ca_needs_ecn(sk) || bpf_needs_ecn;

	if (!use_ecn) {
		const struct dst_entry *dst = __sk_dst_get(sk);

		if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
			use_ecn = true;
	}

	tp->ecn_flags = 0;

	if (use_ecn) {
		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
		tp->ecn_flags = TCP_ECN_OK;
		if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
			INET_ECN_xmit(sk);
	}
}

static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
{
	if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)
		/* tp->ecn_flags are cleared at a later point in time when
		 * the SYN-ACK is ultimately received.
		 */
		TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
}

static void
tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
{
	if (inet_rsk(req)->ecn_ok)
		th->ece = 1;
}

/* Set up ECN state for a packet on an ESTABLISHED socket that is about to
 * be sent.
 */
static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
			 struct tcphdr *th, int tcp_header_len)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tp->ecn_flags & TCP_ECN_OK) {
		/* Not-retransmitted data segment: set ECT and inject CWR. */
		if (skb->len != tcp_header_len &&
		    !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
			INET_ECN_xmit(sk);
			if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
				tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
				th->cwr = 1;
				skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
			}
		} else if (!tcp_ca_needs_ecn(sk)) {
			/* ACK or retransmitted segment: clear ECT|CE */
			INET_ECN_dontxmit(sk);
		}
		if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
			th->ece = 1;
	}
}

/* Constructs common control bits of non-data skb. If SYN/FIN is present,
 * auto increment end seqno.
 */
static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
{
	skb->ip_summed = CHECKSUM_PARTIAL;

	TCP_SKB_CB(skb)->tcp_flags = flags;
	TCP_SKB_CB(skb)->sacked = 0;

	tcp_skb_pcount_set(skb, 1);

	TCP_SKB_CB(skb)->seq = seq;
	if (flags & (TCPHDR_SYN | TCPHDR_FIN))
		seq++;
	TCP_SKB_CB(skb)->end_seq = seq;
}
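/* Note: a pure ACK carries neither SYN nor FIN, so end_seq stays equal
 * to seq (it consumes no sequence space); a SYN or FIN bumps end_seq by
 * one because each occupies one sequence number.
 */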

static inline bool tcp_urg_mode(const struct tcp_sock *tp)
{
	return tp->snd_una != tp->snd_up;
}

#define OPTION_SACK_ADVERTISE	(1 << 0)
#define OPTION_TS		(1 << 1)
#define OPTION_MD5		(1 << 2)
#define OPTION_WSCALE		(1 << 3)
#define OPTION_FAST_OPEN_COOKIE	(1 << 8)
#define OPTION_SMC		(1 << 9)
#define OPTION_MPTCP		(1 << 10)

static void smc_options_write(__be32 *ptr, u16 *options)
{
#if IS_ENABLED(CONFIG_SMC)
	if (static_branch_unlikely(&tcp_have_smc)) {
		if (unlikely(OPTION_SMC & *options)) {
			*ptr++ = htonl((TCPOPT_NOP << 24) |
				       (TCPOPT_NOP << 16) |
				       (TCPOPT_EXP << 8) |
				       (TCPOLEN_EXP_SMC_BASE));
			*ptr++ = htonl(TCPOPT_SMC_MAGIC);
		}
	}
#endif
}

struct tcp_out_options {
	u16 options;		/* bit field of OPTION_* */
	u16 mss;		/* 0 to disable */
	u8 ws;			/* window scale, 0 to disable */
	u8 num_sack_blocks;	/* number of SACK blocks to include */
	u8 hash_size;		/* bytes in hash_location */
	u8 bpf_opt_len;		/* length of BPF hdr option */
	__u8 *hash_location;	/* temporary pointer, overloaded */
	__u32 tsval, tsecr;	/* need to include OPTION_TS */
	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
	struct mptcp_out_options mptcp;
};

static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts)
{
#if IS_ENABLED(CONFIG_MPTCP)
	if (unlikely(OPTION_MPTCP & opts->options))
		mptcp_write_options(ptr, &opts->mptcp);
#endif
}

#ifdef CONFIG_CGROUP_BPF
static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb,
					enum tcp_synack_type synack_type)
{
	if (unlikely(!skb))
		return BPF_WRITE_HDR_TCP_CURRENT_MSS;

	if (unlikely(synack_type == TCP_SYNACK_COOKIE))
		return BPF_WRITE_HDR_TCP_SYNACK_COOKIE;

	return 0;
}

/* req, syn_skb and synack_type are used when writing synack */
static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct sk_buff *syn_skb,
				  enum tcp_synack_type synack_type,
				  struct tcp_out_options *opts,
				  unsigned int *remaining)
{
	struct bpf_sock_ops_kern sock_ops;
	int err;

	if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
					   BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) ||
	    !*remaining)
		return;

	/* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */

	/* init sock_ops */
	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));

	sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB;

	if (req) {
		/* The listen "sk" cannot be passed here because
		 * it is not locked. It would not make much sense
		 * to do bpf_setsockopt(listen_sk) based on an
		 * individual connection request either.
		 *
		 * Thus, "req" is passed here and the cgroup-bpf-progs
		 * of the listen "sk" will be run.
		 *
		 * "req" is also used here for fastopen even though the "sk"
		 * here is a fullsock "child" sk. It is to keep the behavior
		 * consistent between fastopen and non-fastopen on
		 * the bpf programming side.
		 */
		sock_ops.sk = (struct sock *)req;
		sock_ops.syn_skb = syn_skb;
	} else {
		sock_owned_by_me(sk);

		sock_ops.is_fullsock = 1;
		sock_ops.sk = sk;
	}

	sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
	sock_ops.remaining_opt_len = *remaining;
	/* tcp_current_mss() does not pass a skb */
	if (skb)
		bpf_skops_init_skb(&sock_ops, skb, 0);

	err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);

	if (err || sock_ops.remaining_opt_len == *remaining)
		return;

	opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len;
	/* round up to 4 bytes */
	opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3;

	*remaining -= opts->bpf_opt_len;
}
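/* Example of the 4-byte rounding above (illustrative): if the BPF program
 * asks for 6 bytes of option space, (6 + 3) & ~3 == 8, so 8 bytes are
 * reserved; any unused tail is later NOP-padded by
 * bpf_skops_write_hdr_opt() below.
 */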

static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
				    struct request_sock *req,
				    struct sk_buff *syn_skb,
				    enum tcp_synack_type synack_type,
				    struct tcp_out_options *opts)
{
	u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len;
	struct bpf_sock_ops_kern sock_ops;
	int err;

	if (likely(!max_opt_len))
		return;

	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));

	sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB;

	if (req) {
		sock_ops.sk = (struct sock *)req;
		sock_ops.syn_skb = syn_skb;
	} else {
		sock_owned_by_me(sk);

		sock_ops.is_fullsock = 1;
		sock_ops.sk = sk;
	}

	sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
	sock_ops.remaining_opt_len = max_opt_len;
	first_opt_off = tcp_hdrlen(skb) - max_opt_len;
	bpf_skops_init_skb(&sock_ops, skb, first_opt_off);

	err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);

	if (err)
		nr_written = 0;
	else
		nr_written = max_opt_len - sock_ops.remaining_opt_len;

	if (nr_written < max_opt_len)
		memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP,
		       max_opt_len - nr_written);
}
#else
static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct sk_buff *syn_skb,
				  enum tcp_synack_type synack_type,
				  struct tcp_out_options *opts,
				  unsigned int *remaining)
{
}

static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
				    struct request_sock *req,
				    struct sk_buff *syn_skb,
				    enum tcp_synack_type synack_type,
				    struct tcp_out_options *opts)
{
}
#endif

/* Write previously computed TCP options to the packet.
 *
 * Beware: Something in the Internet is very sensitive to the ordering of
 * TCP options; we learned this the hard way, so be careful here.
 * Luckily we can at least blame others for their non-compliance, but from
 * an interoperability perspective it seems that we're somewhat stuck with
 * the ordering which we have been using if we want to keep working with
 * those broken things (not that it currently hurts anybody as there isn't
 * a particular reason why the ordering would need to be changed).
 *
 * At least SACK_PERM as the first option is known to lead to a disaster
 * (but it may well be that other scenarios fail similarly).
 */
static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
			      struct tcp_out_options *opts)
{
	u16 options = opts->options;	/* mungable copy */

	if (unlikely(OPTION_MD5 & options)) {
		*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
			       (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
		/* overload cookie hash location */
		opts->hash_location = (__u8 *)ptr;
		ptr += 4;
	}

	if (unlikely(opts->mss)) {
		*ptr++ = htonl((TCPOPT_MSS << 24) |
			       (TCPOLEN_MSS << 16) |
			       opts->mss);
	}

	if (likely(OPTION_TS & options)) {
		if (unlikely(OPTION_SACK_ADVERTISE & options)) {
			*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
				       (TCPOLEN_SACK_PERM << 16) |
				       (TCPOPT_TIMESTAMP << 8) |
				       TCPOLEN_TIMESTAMP);
			options &= ~OPTION_SACK_ADVERTISE;
		} else {
			*ptr++ = htonl((TCPOPT_NOP << 24) |
				       (TCPOPT_NOP << 16) |
				       (TCPOPT_TIMESTAMP << 8) |
				       TCPOLEN_TIMESTAMP);
		}
		*ptr++ = htonl(opts->tsval);
		*ptr++ = htonl(opts->tsecr);
	}

	if (unlikely(OPTION_SACK_ADVERTISE & options)) {
		*ptr++ = htonl((TCPOPT_NOP << 24) |
			       (TCPOPT_NOP << 16) |
			       (TCPOPT_SACK_PERM << 8) |
			       TCPOLEN_SACK_PERM);
	}

	if (unlikely(OPTION_WSCALE & options)) {
		*ptr++ = htonl((TCPOPT_NOP << 24) |
			       (TCPOPT_WINDOW << 16) |
			       (TCPOLEN_WINDOW << 8) |
			       opts->ws);
	}

	if (unlikely(opts->num_sack_blocks)) {
		struct tcp_sack_block *sp = tp->rx_opt.dsack ?
			tp->duplicate_sack : tp->selective_acks;
		int this_sack;

		*ptr++ = htonl((TCPOPT_NOP << 24) |
			       (TCPOPT_NOP << 16) |
			       (TCPOPT_SACK << 8) |
			       (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
						     TCPOLEN_SACK_PERBLOCK)));

		for (this_sack = 0; this_sack < opts->num_sack_blocks;
		     ++this_sack) {
			*ptr++ = htonl(sp[this_sack].start_seq);
			*ptr++ = htonl(sp[this_sack].end_seq);
		}

		tp->rx_opt.dsack = 0;
	}

	if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
		struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
		u8 *p = (u8 *)ptr;
		u32 len; /* Fast Open option length */

		if (foc->exp) {
			len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
			*ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
				     TCPOPT_FASTOPEN_MAGIC);
			p += TCPOLEN_EXP_FASTOPEN_BASE;
		} else {
			len = TCPOLEN_FASTOPEN_BASE + foc->len;
			*p++ = TCPOPT_FASTOPEN;
			*p++ = len;
		}

		memcpy(p, foc->val, foc->len);
		if ((len & 3) == 2) {
			p[foc->len] = TCPOPT_NOP;
			p[foc->len + 1] = TCPOPT_NOP;
		}
		ptr += (len + 3) >> 2;
	}

	smc_options_write(ptr, &options);

	mptcp_options_write(ptr, opts);
}
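/* For reference, the common timestamp encoding above (NOP, NOP,
 * TIMESTAMP, len 10) packs into the single word
 * (1 << 24) | (1 << 16) | (8 << 8) | 10 == 0x0101080a, the familiar
 * "01 01 08 0a" byte pattern seen in packet captures, followed by the
 * two 32-bit TSval/TSecr words.
 */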

static void smc_set_option(const struct tcp_sock *tp,
			   struct tcp_out_options *opts,
			   unsigned int *remaining)
{
#if IS_ENABLED(CONFIG_SMC)
	if (static_branch_unlikely(&tcp_have_smc)) {
		if (tp->syn_smc) {
			if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
				opts->options |= OPTION_SMC;
				*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
			}
		}
	}
#endif
}

static void smc_set_option_cond(const struct tcp_sock *tp,
				const struct inet_request_sock *ireq,
				struct tcp_out_options *opts,
				unsigned int *remaining)
{
#if IS_ENABLED(CONFIG_SMC)
	if (static_branch_unlikely(&tcp_have_smc)) {
		if (tp->syn_smc && ireq->smc_ok) {
			if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
				opts->options |= OPTION_SMC;
				*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
			}
		}
	}
#endif
}

static void mptcp_set_option_cond(const struct request_sock *req,
				  struct tcp_out_options *opts,
				  unsigned int *remaining)
{
	if (rsk_is_mptcp(req)) {
		unsigned int size;

		if (mptcp_synack_options(req, &size, &opts->mptcp)) {
			if (*remaining >= size) {
				opts->options |= OPTION_MPTCP;
				*remaining -= size;
			}
		}
	}
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 755)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 756) /* Compute TCP options for SYN packets. This is not the final
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 757) * network wire format yet.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 758) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 759) static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 760) struct tcp_out_options *opts,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 761) struct tcp_md5sig_key **md5)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 762) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 763) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 764) unsigned int remaining = MAX_TCP_OPTION_SPACE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 765) struct tcp_fastopen_request *fastopen = tp->fastopen_req;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 766)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 767) *md5 = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 768) #ifdef CONFIG_TCP_MD5SIG
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 769) if (static_branch_unlikely(&tcp_md5_needed) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 770) rcu_access_pointer(tp->md5sig_info)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771) *md5 = tp->af_specific->md5_lookup(sk, sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772) if (*md5) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773) opts->options |= OPTION_MD5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774) remaining -= TCPOLEN_MD5SIG_ALIGNED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779) /* We always get an MSS option. The option bytes which will be seen in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780) * normal data packets (should timestamps be used) must fit within the MSS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781) * we advertise. But we subtract them from tp->mss_cache so that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) * calculations in tcp_sendmsg are simpler etc. So account for this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783) * fact here if necessary. If we don't do this correctly, as a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784) * receiver we won't recognize data packets as being full sized when we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785) * should, and thus we won't abide by the delayed ACK rules correctly.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786) * SACKs don't matter; we never delay an ACK when we have any of those
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787) * going out. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) opts->mss = tcp_advertise_mss(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789) remaining -= TCPOLEN_MSS_ALIGNED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791) if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792) opts->options |= OPTION_TS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793) opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794) opts->tsecr = tp->rx_opt.ts_recent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795) remaining -= TCPOLEN_TSTAMP_ALIGNED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797) if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798) opts->ws = tp->rx_opt.rcv_wscale;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) opts->options |= OPTION_WSCALE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800) remaining -= TCPOLEN_WSCALE_ALIGNED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802) if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803) opts->options |= OPTION_SACK_ADVERTISE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804) if (unlikely(!(OPTION_TS & opts->options)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) remaining -= TCPOLEN_SACKPERM_ALIGNED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808) if (fastopen && fastopen->cookie.len >= 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809) u32 need = fastopen->cookie.len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811) need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) TCPOLEN_FASTOPEN_BASE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813) need = (need + 3) & ~3U; /* Align to 32 bits */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814) if (remaining >= need) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815) opts->options |= OPTION_FAST_OPEN_COOKIE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816) opts->fastopen_cookie = &fastopen->cookie;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) remaining -= need;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818) tp->syn_fastopen = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823) smc_set_option(tp, opts, &remaining);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) if (sk_is_mptcp(sk)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) unsigned int size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829) opts->options |= OPTION_MPTCP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) remaining -= size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834) bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) return MAX_TCP_OPTION_SPACE - remaining;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837) }
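
/*
 * Worked example (illustrative annotation, not part of the original source):
 * with the usual aligned option sizes (MSS 4, timestamps 12, window scale 4),
 * a typical SYN built above consumes 4 + 12 + 4 = 20 of the 40 bytes of
 * MAX_TCP_OPTION_SPACE.  SACK-permitted is only charged its 4 aligned bytes
 * when timestamps are off, since it is packed together with the timestamp
 * option on the wire.  The remaining ~20 bytes are what the Fast Open cookie
 * (2- or 4-byte base plus cookie, rounded up to a multiple of 4), SMC, MPTCP
 * and BPF-written options compete for, in the order the code above tries them.
 */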
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) /* Set up TCP options for SYN-ACKs. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) static unsigned int tcp_synack_options(const struct sock *sk,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) struct request_sock *req,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) unsigned int mss, struct sk_buff *skb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) struct tcp_out_options *opts,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) const struct tcp_md5sig_key *md5,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) struct tcp_fastopen_cookie *foc,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) enum tcp_synack_type synack_type,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) struct sk_buff *syn_skb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) struct inet_request_sock *ireq = inet_rsk(req);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) unsigned int remaining = MAX_TCP_OPTION_SPACE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) #ifdef CONFIG_TCP_MD5SIG
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) if (md5) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) opts->options |= OPTION_MD5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) remaining -= TCPOLEN_MD5SIG_ALIGNED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) /* We can't fit any SACK blocks in a packet with MD5 + TS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) * options. There was discussion about disabling SACK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) * rather than TS in order to fit in better with old,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) * buggy kernels, but that was deemed to be unnecessary.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) if (synack_type != TCP_SYNACK_COOKIE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) ireq->tstamp_ok &= !ireq->sack_ok;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867) /* We always send an MSS option. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) opts->mss = mss;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) remaining -= TCPOLEN_MSS_ALIGNED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) if (likely(ireq->wscale_ok)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) opts->ws = ireq->rcv_wscale;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) opts->options |= OPTION_WSCALE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) remaining -= TCPOLEN_WSCALE_ALIGNED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) if (likely(ireq->tstamp_ok)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877) opts->options |= OPTION_TS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) opts->tsecr = req->ts_recent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880) remaining -= TCPOLEN_TSTAMP_ALIGNED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) if (likely(ireq->sack_ok)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) opts->options |= OPTION_SACK_ADVERTISE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) if (unlikely(!ireq->tstamp_ok))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) remaining -= TCPOLEN_SACKPERM_ALIGNED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) if (foc != NULL && foc->len >= 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) u32 need = foc->len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) TCPOLEN_FASTOPEN_BASE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) need = (need + 3) & ~3U; /* Align to 32 bits */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) if (remaining >= need) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) opts->options |= OPTION_FAST_OPEN_COOKIE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) opts->fastopen_cookie = foc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) remaining -= need;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) mptcp_set_option_cond(req, opts, &remaining);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902) smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) synack_type, opts, &remaining);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907) return MAX_TCP_OPTION_SPACE - remaining;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) /* Compute TCP options for ESTABLISHED sockets. This is not the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) * final wire format yet.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) struct tcp_out_options *opts,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) struct tcp_md5sig_key **md5)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) unsigned int size = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) unsigned int eff_sacks;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921) opts->options = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923) *md5 = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) #ifdef CONFIG_TCP_MD5SIG
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) if (static_branch_unlikely(&tcp_md5_needed) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) rcu_access_pointer(tp->md5sig_info)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) *md5 = tp->af_specific->md5_lookup(sk, sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928) if (*md5) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929) opts->options |= OPTION_MD5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) size += TCPOLEN_MD5SIG_ALIGNED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) if (likely(tp->rx_opt.tstamp_ok)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936) opts->options |= OPTION_TS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) opts->tsecr = tp->rx_opt.ts_recent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939) size += TCPOLEN_TSTAMP_ALIGNED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) /* MPTCP options have precedence over SACK for the limited TCP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943) * option space because an MPTCP connection would be forced to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944) * fall back to regular TCP if a required multipath option is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) * missing. SACK still gets a chance to use whatever space is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) * left.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948) if (sk_is_mptcp(sk)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949) unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950) unsigned int opt_size = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) if (mptcp_established_options(sk, skb, &opt_size, remaining,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953) &opts->mptcp)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) opts->options |= OPTION_MPTCP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955) size += opt_size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) if (unlikely(eff_sacks)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961) const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963) TCPOLEN_SACK_PERBLOCK))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964) return size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) opts->num_sack_blocks =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967) min_t(unsigned int, eff_sacks,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) TCPOLEN_SACK_PERBLOCK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) size += TCPOLEN_SACK_BASE_ALIGNED +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972) opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976) BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981) size = MAX_TCP_OPTION_SPACE - remaining;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984) return size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) }
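
/*
 * Illustrative sketch (not part of the original source): the SACK sizing
 * above reduces to "how many 8-byte blocks fit after the 4-byte aligned SACK
 * base in whatever is left of the 40 option bytes", further capped by
 * eff_sacks.  With timestamps enabled (12 bytes) that is (40 - 12 - 4) / 8 =
 * 3 blocks; without timestamps, (40 - 4) / 8 = 4.  A stand-alone helper doing
 * the same arithmetic might look like this:
 */
#if 0	/* example only, never compiled */
static unsigned int tcp_max_sack_blocks(unsigned int size_used)
{
	unsigned int remaining = MAX_TCP_OPTION_SPACE - size_used;

	if (remaining < TCPOLEN_SACK_BASE_ALIGNED + TCPOLEN_SACK_PERBLOCK)
		return 0;
	return (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK;
}
#endif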
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988) /* TCP SMALL QUEUES (TSQ)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) * TSQ's goal is to keep a small number of skbs per TCP flow in the tx queues (qdisc + dev)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991) * to reduce RTT and bufferbloat.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992) * We do this using a special skb destructor (tcp_wfree).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994) * It's important that tcp_wfree() can be replaced by sock_wfree() in the event
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995) * the skb needs to be reallocated in a driver.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) * The invariant is that skb->truesize is subtracted from sk->sk_wmem_alloc.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) * Since transmit from skb destructor is forbidden, we use a tasklet
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) * to process all sockets that eventually need to send more skbs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) * We use one tasklet per cpu, with its own queue of sockets.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) struct tsq_tasklet {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) struct tasklet_struct tasklet;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) struct list_head head; /* queue of tcp sockets */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
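
/*
 * Orientation note (annotation): a flow is marked TSQF_THROTTLED elsewhere in
 * this file once it already has enough bytes sitting in the qdisc/device
 * queues.  When one of those skbs is freed, its destructor tcp_wfree()
 * (below) flips the socket to TSQF_QUEUED, links tp->tsq_node onto this cpu's
 * tsq_tasklet list and schedules the tasklet.  tcp_tasklet_func() later
 * clears TSQ_QUEUED and calls tcp_tsq_handler(), which either transmits right
 * away via tcp_tsq_write() or, when the socket is owned by user context,
 * defers the work to tcp_release_cb() through the TCP_TSQ_DEFERRED bit.
 */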
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) static void tcp_tsq_write(struct sock *sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) if ((1 << sk->sk_state) &
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) if (tp->lost_out > tp->retrans_out &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) tp->snd_cwnd > tcp_packets_in_flight(tp)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) tcp_mstamp_refresh(tp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) tcp_xmit_retransmit_queue(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) 0, GFP_ATOMIC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) static void tcp_tsq_handler(struct sock *sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) bh_lock_sock(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) if (!sock_owned_by_user(sk))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) tcp_tsq_write(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) sock_hold(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) bh_unlock_sock(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) * One tasklet per cpu tries to send more skbs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) * We run in tasklet context but need to disable irqs when
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) * transferring tsq->head because tcp_wfree() might
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) * interrupt us (non-NAPI drivers).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) static void tcp_tasklet_func(unsigned long data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) LIST_HEAD(list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) struct list_head *q, *n;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) struct tcp_sock *tp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) struct sock *sk;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) local_irq_save(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) list_splice_init(&tsq->head, &list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) local_irq_restore(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) list_for_each_safe(q, n, &list) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) tp = list_entry(q, struct tcp_sock, tsq_node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) list_del(&tp->tsq_node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) sk = (struct sock *)tp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) smp_mb__before_atomic();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) tcp_tsq_handler(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) sk_free(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) #define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) TCPF_WRITE_TIMER_DEFERRED | \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) TCPF_DELACK_TIMER_DEFERRED | \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) TCPF_MTU_REDUCED_DEFERRED)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) * tcp_release_cb - tcp release_sock() callback
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) * @sk: socket
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) * called from release_sock() to perform protocol-dependent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) * actions before socket release.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) void tcp_release_cb(struct sock *sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) unsigned long flags, nflags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) /* perform an atomic operation only if at least one flag is set */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) flags = sk->sk_tsq_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) if (!(flags & TCP_DEFERRED_ALL))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) nflags = flags & ~TCP_DEFERRED_ALL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) if (flags & TCPF_TSQ_DEFERRED) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) tcp_tsq_write(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) __sock_put(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) /* Here begins the tricky part:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) * We are called from release_sock() with:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) * 1) BH disabled
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) * 2) sk_lock.slock spinlock held
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) * 3) socket owned by us (sk->sk_lock.owned == 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) * But the following code is meant to be called from BH handlers,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) * so we should keep BH disabled, but release socket ownership early.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) sock_release_ownership(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) if (flags & TCPF_WRITE_TIMER_DEFERRED) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) tcp_write_timer_handler(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) __sock_put(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109) if (flags & TCPF_DELACK_TIMER_DEFERRED) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) tcp_delack_timer_handler(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) __sock_put(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) if (flags & TCPF_MTU_REDUCED_DEFERRED) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) __sock_put(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) EXPORT_SYMBOL(tcp_release_cb);
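
/*
 * Reference-counting note (annotation): each *_DEFERRED bit is paired with a
 * socket reference taken (or kept) by the context that could not do the work
 * directly -- see the sock_hold() in tcp_tsq_handler() above for
 * TCP_TSQ_DEFERRED -- and each branch above releases it again with
 * __sock_put() once the deferred work has run.
 */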
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) void __init tcp_tasklet_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) for_each_possible_cpu(i) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) INIT_LIST_HEAD(&tsq->head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) tasklet_init(&tsq->tasklet,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) tcp_tasklet_func,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) (unsigned long)tsq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) * Write buffer destructor automatically called from kfree_skb.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) * We can't xmit new skbs from this context, as we might already
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) * hold qdisc lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) void tcp_wfree(struct sk_buff *skb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) struct sock *sk = skb->sk;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) unsigned long flags, nval, oval;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) /* Keep one reference on sk_wmem_alloc (hence the truesize - 1 below).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) * It will be released by sk_free(), either here or from tcp_tasklet_func().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) /* If this softirq is serviced by ksoftirqd, we are likely under stress.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) * Wait until our queues (qdisc + devices) are drained.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) * This gives:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) * - fewer callbacks to tcp_write_xmit(), reducing stress (batches)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154) * - a chance for an incoming ACK (possibly processed by another cpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) * to migrate this flow (skb->ooo_okay will eventually be set)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) struct tsq_tasklet *tsq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) bool empty;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169) if (nval != oval)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) /* queue this socket to tasklet queue */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) local_irq_save(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) tsq = this_cpu_ptr(&tsq_tasklet);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) empty = list_empty(&tsq->head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) list_add(&tp->tsq_node, &tsq->head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) if (empty)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) tasklet_schedule(&tsq->tasklet);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) local_irq_restore(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) sk_free(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) /* Note: Called under soft irq.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) * We can call the TCP stack right away, unless the socket is owned by the user.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) struct sock *sk = (struct sock *)tp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) tcp_tsq_handler(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) sock_put(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) return HRTIMER_NORESTART;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) u64 prior_wstamp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) if (sk->sk_pacing_status != SK_PACING_NONE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) unsigned long rate = sk->sk_pacing_rate;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) /* Original sch_fq does not pace the first 10 MSS.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) * Note that tp->data_segs_out overflows after 2^32 packets;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) * this is a minor annoyance.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) if (rate != ~0UL && rate && tp->data_segs_out >= 10) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214) u64 credit = tp->tcp_wstamp_ns - prior_wstamp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) /* take into account OS jitter */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) len_ns -= min_t(u64, len_ns / 2, credit);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) tp->tcp_wstamp_ns += len_ns;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) }
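
/*
 * Worked example (illustrative): with sk_pacing_rate = 125,000,000 bytes/sec
 * (~1 Gbit/s), a 64 KB GSO skb gives len_ns = 65536 * NSEC_PER_SEC / rate,
 * roughly 524 usec.  If the clock had already advanced tcp_wstamp_ns by
 * 200 usec since the previous skb (the "credit" above), only
 * 524 - min(262, 200) = 324 usec are added, so transient OS jitter does not
 * permanently push the flow below its configured pacing rate.
 */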
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) /* This routine actually transmits TCP packets queued in by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) * tcp_do_sendmsg(). This is used by both the initial
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) * transmission and possible later retransmissions.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) * All SKBs seen here are completely headerless. It is our
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) * job to build the TCP header, and pass the packet down to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) * IP so it can do the same plus pass the packet off to the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) * device.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) * We are working here with either a clone of the original
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) * SKB, or a fresh unique copy made by the retransmit engine.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) const struct inet_connection_sock *icsk = inet_csk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) struct inet_sock *inet;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244) struct tcp_sock *tp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245) struct tcp_skb_cb *tcb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) struct tcp_out_options opts;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) unsigned int tcp_options_size, tcp_header_size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) struct sk_buff *oskb = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) struct tcp_md5sig_key *md5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250) struct tcphdr *th;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) u64 prior_wstamp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) BUG_ON(!skb || !tcp_skb_pcount(skb));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255) tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) prior_wstamp = tp->tcp_wstamp_ns;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258) skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259) if (clone_it) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260) TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261) - tp->snd_una;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) oskb = skb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) tcp_skb_tsorted_save(oskb) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265) if (unlikely(skb_cloned(oskb)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) skb = pskb_copy(oskb, gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268) skb = skb_clone(oskb, gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) } tcp_skb_tsorted_restore(oskb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) if (unlikely(!skb))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) return -ENOBUFS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) /* retransmit skbs might have a non-zero value in skb->dev
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274) * because skb->dev is aliased with skb->rbnode.rb_left.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276) skb->dev = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) inet = inet_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) tcb = TCP_SKB_CB(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) memset(&opts, 0, sizeof(opts));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284) tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) tcp_options_size = tcp_established_options(sk, skb, &opts,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287) &md5);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288) /* Force a PSH flag on all (GSO) packets to expedite GRO flush
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) * at the receiver: this slightly improves GRO performance.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290) * Note that we do not force the PSH flag for non-GSO packets,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291) * because they might be sent under high congestion events,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292) * and in this case it is better to delay the delivery of 1-MSS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293) * packets and thus the corresponding ACK packet that would
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294) * release the following packet.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296) if (tcp_skb_pcount(skb) > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) tcb->tcp_flags |= TCPHDR_PSH;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299) tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301) /* if no packet is in qdisc/device queue, then allow XPS to select
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302) * another queue. We can be called from tcp_tsq_handler()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303) * which holds one reference to sk.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305) * TODO: Ideally, in-flight pure ACK packets should not matter here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306) * One way to get this would be to set skb->truesize = 2 on them.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308) skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310) /* If we had to use memory reserve to allocate this skb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311) * this might cause drops if the packet is looped back:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312) * the other socket might not have SOCK_MEMALLOC.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313) * Packets not looped back do not care about pfmemalloc.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315) skb->pfmemalloc = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317) skb_push(skb, tcp_header_size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318) skb_reset_transport_header(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320) skb_orphan(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321) skb->sk = sk;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322) skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323) skb_set_hash_from_sk(skb, sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1324) refcount_add(skb->truesize, &sk->sk_wmem_alloc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1325)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1326) skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1327)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1328) /* Build TCP header and checksum it. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1329) th = (struct tcphdr *)skb->data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1330) th->source = inet->inet_sport;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1331) th->dest = inet->inet_dport;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1332) th->seq = htonl(tcb->seq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1333) th->ack_seq = htonl(rcv_nxt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1334) *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1335) tcb->tcp_flags);
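/* The store above packs the data offset and flags into the 16-bit word at
 * byte offset 12 of the TCP header.  For example, a 32-byte header (20 bytes
 * plus 12 bytes of timestamp options) gives doff = 32 >> 2 = 8, i.e. 0x8000
 * once shifted, and OR-ing in ACK|PSH (0x10|0x08) yields 0x8018 before the
 * htons() byte swap.  (Illustrative annotation, not in the original source.)
 */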
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1336)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1337) th->check = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1338) th->urg_ptr = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1339)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1340) /* The urg_mode check is necessary during a window probe below snd_una */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1341) if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1342) if (before(tp->snd_up, tcb->seq + 0x10000)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1343) th->urg_ptr = htons(tp->snd_up - tcb->seq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1344) th->urg = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1345) } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1346) th->urg_ptr = htons(0xFFFF);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1347) th->urg = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1348) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1349) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1350)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1351) tcp_options_write((__be32 *)(th + 1), tp, &opts);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1352) skb_shinfo(skb)->gso_type = sk->sk_gso_type;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1353) if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1354) th->window = htons(tcp_select_window(sk));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355) tcp_ecn_send(sk, skb, th, tcp_header_size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) /* RFC1323: The window in SYN & SYN/ACK segments
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358) * is never scaled.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360) th->window = htons(min(tp->rcv_wnd, 65535U));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362) #ifdef CONFIG_TCP_MD5SIG
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363) /* Calculate the MD5 hash, as we have all we need now */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) if (md5) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365) sk_nocaps_add(sk, NETIF_F_GSO_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) tp->af_specific->calc_md5_hash(opts.hash_location,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367) md5, sk, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371) /* BPF prog is the last one writing header option */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372) bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374) INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375) tcp_v6_send_check, tcp_v4_send_check,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376) sk, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378) if (likely(tcb->tcp_flags & TCPHDR_ACK))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379) tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381) if (skb->len != tcp_header_size) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382) tcp_event_data_sent(tp, sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383) tp->data_segs_out += tcp_skb_pcount(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384) tp->bytes_sent += skb->len - tcp_header_size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388) TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) tcp_skb_pcount(skb));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391) tp->segs_out += tcp_skb_pcount(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392) /* OK, it's time to fill skb_shinfo(skb)->gso_{segs|size} */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393) skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394) skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396) /* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398) /* Cleanup our debris for IP stacks */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399) memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) sizeof(struct inet6_skb_parm)));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) tcp_add_tx_delay(skb, tp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1404) err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1405) inet6_csk_xmit, ip_queue_xmit,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1406) sk, skb, &inet->cork.fl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1407)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1408) if (unlikely(err > 0)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1409) tcp_enter_cwr(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1410) err = net_xmit_eval(err);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1411) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1412) if (!err && oskb) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1413) tcp_update_skb_after_send(sk, oskb, prior_wstamp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1414) tcp_rate_skb_sent(sk, oskb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1415) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1416) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1417) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1418)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1419) static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1420) gfp_t gfp_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1421) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1422) return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1423) tcp_sk(sk)->rcv_nxt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1424) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1425)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1426) /* This routine just queues the buffer for sending.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1427) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1428) * NOTE: the probe0 timer is not checked; do not forget tcp_push_pending_frames(),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1429) * otherwise the socket can stall.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1430) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1431) static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1432) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1433) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1434)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1435) /* Advance write_seq and place onto the write_queue. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1436) WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1437) __skb_header_release(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1438) tcp_add_write_queue_tail(sk, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1439) sk_wmem_queued_add(sk, skb->truesize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1440) sk_mem_charge(sk, skb->truesize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1441) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1442)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1443) /* Initialize TSO segments for a packet. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1444) static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1445) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1446) if (skb->len <= mss_now) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1447) /* Avoid the costly divide in the normal
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1448) * non-TSO case.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1449) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1450) tcp_skb_pcount_set(skb, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1451) TCP_SKB_CB(skb)->tcp_gso_size = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1452) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1453) tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1454) TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1455) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1456) }
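
/*
 * Example (annotation): with mss_now = 1448, a 30,000 byte skb is marked as
 * DIV_ROUND_UP(30000, 1448) = 21 segments with tcp_gso_size = 1448, whereas
 * anything of at most one MSS keeps pcount = 1 and tcp_gso_size = 0, i.e. it
 * is not a GSO packet.
 */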
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1457)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1458) /* Pcount in the middle of the write queue got changed; we need to do various
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1459) * tweaks to fix the counters.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1460) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1461) static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1462) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1463) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1464)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1465) tp->packets_out -= decr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1466)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1467) if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1468) tp->sacked_out -= decr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1469) if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1470) tp->retrans_out -= decr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1471) if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1472) tp->lost_out -= decr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1473)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1474) /* Reno case is special. Sigh... */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1475) if (tcp_is_reno(tp) && decr > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1476) tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1477)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1478) if (tp->lost_skb_hint &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1479) before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1480) (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1481) tp->lost_cnt_hint -= decr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1482)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1483) tcp_verify_left_out(tp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1484) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1485)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1486) static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1487) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1488) return TCP_SKB_CB(skb)->txstamp_ack ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1489) (skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1490) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1491)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1492) static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1493) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1494) struct skb_shared_info *shinfo = skb_shinfo(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1495)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1496) if (unlikely(tcp_has_tx_tstamp(skb)) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1497) !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1498) struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1499) u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1500)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1501) shinfo->tx_flags &= ~tsflags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1502) shinfo2->tx_flags |= tsflags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1503) swap(shinfo->tskey, shinfo2->tskey);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1504) TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1505) TCP_SKB_CB(skb)->txstamp_ack = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1506) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1507) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1508)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1509) static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1510) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1511) TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1512) TCP_SKB_CB(skb)->eor = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1513) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1514)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1515) /* Insert buff after skb on the write or rtx queue of sk. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1516) static void tcp_insert_write_queue_after(struct sk_buff *skb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1517) struct sk_buff *buff,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1518) struct sock *sk,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1519) enum tcp_queue tcp_queue)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1520) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1521) if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1522) __skb_queue_after(&sk->sk_write_queue, skb, buff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1523) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1524) tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1525) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1526)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1527) /* Function to create two new TCP segments. Shrinks the given segment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1528) * to the specified size and appends a new segment with the rest of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1529) * packet to the list. This won't be called frequently, I hope.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1530) * Remember, these are still headerless SKBs at this point.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1531) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1532) int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1533) struct sk_buff *skb, u32 len,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1534) unsigned int mss_now, gfp_t gfp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1535) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1536) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1537) struct sk_buff *buff;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1538) int nsize, old_factor;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1539) long limit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1540) int nlen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1541) u8 flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1542)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1543) if (WARN_ON(len > skb->len))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1544) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1545)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1546) nsize = skb_headlen(skb) - len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1547) if (nsize < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1548) nsize = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1549)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1550) /* tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1551) * We need some allowance to not penalize applications setting small
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1552) * SO_SNDBUF values.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1553) * Also allow first and last skb in retransmit queue to be split.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1554) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1555) limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_MAX_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1556) if (unlikely((sk->sk_wmem_queued >> 1) > limit &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1557) tcp_queue != TCP_FRAG_IN_WRITE_QUEUE &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1558) skb != tcp_rtx_queue_head(sk) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1559) skb != tcp_rtx_queue_tail(sk))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1560) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1561) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1562) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1563)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1564) if (skb_unclone_keeptruesize(skb, gfp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1565) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1566)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1567) /* Get a new skb... force flag on. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1568) buff = sk_stream_alloc_skb(sk, nsize, gfp, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1569) if (!buff)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1570) return -ENOMEM; /* We'll just try again later. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1571) skb_copy_decrypted(buff, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1572)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1573) sk_wmem_queued_add(sk, buff->truesize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1574) sk_mem_charge(sk, buff->truesize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1575) nlen = skb->len - len - nsize;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1576) buff->truesize += nlen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1577) skb->truesize -= nlen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1578)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1579) /* Correct the sequence numbers. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1580) TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1581) TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1582) TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1583)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1584) /* PSH and FIN should only be set in the second packet. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1585) flags = TCP_SKB_CB(skb)->tcp_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1586) TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1587) TCP_SKB_CB(buff)->tcp_flags = flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1588) TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1589) tcp_skb_fragment_eor(skb, buff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1590)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1591) skb_split(skb, buff, len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1592)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1593) buff->ip_summed = CHECKSUM_PARTIAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1594)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1595) buff->tstamp = skb->tstamp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1596) tcp_fragment_tstamp(skb, buff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1597)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1598) old_factor = tcp_skb_pcount(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1599)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1600) /* Fix up tso_factor for both original and new SKB. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1601) tcp_set_skb_tso_segs(skb, mss_now);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1602) tcp_set_skb_tso_segs(buff, mss_now);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1603)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1604) /* Update delivered info for the new segment */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1605) TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1606)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1607) /* If this packet has been sent out already, we must
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1608) * adjust the various packet counters.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1609) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1610) if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1611) int diff = old_factor - tcp_skb_pcount(skb) -
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1612) tcp_skb_pcount(buff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1613)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1614) if (diff)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1615) tcp_adjust_pcount(sk, skb, diff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1616) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1617)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1618) /* Link BUFF into the send queue. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1619) __skb_header_release(buff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1620) tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1621) if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1622) list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1623)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1624) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1625) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1626)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1627) /* This is similar to __pskb_pull_tail(). The difference is that pulled
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1628) * data is not copied, but immediately discarded.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1629) */
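/* Example: trimming 300 bytes from an skb with a 100-byte linear head and
 * page frags of 200 and 500 bytes pulls the whole head, drops the 200-byte
 * frag, leaves the 500-byte frag untouched and returns 200: only the bytes
 * removed from frags are reported, since the linear buffer stays allocated.
 */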
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1630) static int __pskb_trim_head(struct sk_buff *skb, int len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1631) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1632) struct skb_shared_info *shinfo;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1633) int i, k, eat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1634)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1635) eat = min_t(int, len, skb_headlen(skb));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1636) if (eat) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1637) __skb_pull(skb, eat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1638) len -= eat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1639) if (!len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1640) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1641) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1642) eat = len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1643) k = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1644) shinfo = skb_shinfo(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1645) for (i = 0; i < shinfo->nr_frags; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1646) int size = skb_frag_size(&shinfo->frags[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1647)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1648) if (size <= eat) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1649) skb_frag_unref(skb, i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1650) eat -= size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1651) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1652) shinfo->frags[k] = shinfo->frags[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1653) if (eat) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1654) skb_frag_off_add(&shinfo->frags[k], eat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1655) skb_frag_size_sub(&shinfo->frags[k], eat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1656) eat = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1657) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1658) k++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1659) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1660) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1661) shinfo->nr_frags = k;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1662)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1663) skb->data_len -= len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1664) skb->len = skb->data_len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1665) return len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1666) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1667)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1668) /* Remove acked data from a packet in the transmit queue. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1669) int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1670) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1671) u32 delta_truesize;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1672)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1673) if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1674) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1675)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1676) delta_truesize = __pskb_trim_head(skb, len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1677)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1678) TCP_SKB_CB(skb)->seq += len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1679) skb->ip_summed = CHECKSUM_PARTIAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1680)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1681) if (delta_truesize) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1682) skb->truesize -= delta_truesize;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1683) sk_wmem_queued_add(sk, -delta_truesize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1684) sk_mem_uncharge(sk, delta_truesize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1685) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1686)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1687) /* Any change of skb->len requires recalculation of tso factor. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1688) if (tcp_skb_pcount(skb) > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1689) tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1690)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1691) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1692) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1693)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1694) /* Calculate MSS, not accounting for any TCP options. */
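/* Example: for IPv4 with a path MTU of 1500 and no extension headers,
 * mss_now = 1500 - 20 (IP header) - 20 (TCP header) = 1460, then clamped to
 * tp->rx_opt.mss_clamp, reduced by icsk_ext_hdr_len (e.g. IPsec), and never
 * allowed below sysctl_tcp_min_snd_mss.
 */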
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1695) static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1696) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1697) const struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1698) const struct inet_connection_sock *icsk = inet_csk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1699) int mss_now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1700)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1701) /* Calculate base mss without TCP options:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1702) It is MMS_S - sizeof(tcphdr) of rfc1122
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1703) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1704) mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1705)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1706) /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1707) if (icsk->icsk_af_ops->net_frag_header_len) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1708) const struct dst_entry *dst = __sk_dst_get(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1709)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1710) if (dst && dst_allfrag(dst))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1711) mss_now -= icsk->icsk_af_ops->net_frag_header_len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1712) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1713)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1714) /* Clamp it (mss_clamp does not include tcp options) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1715) if (mss_now > tp->rx_opt.mss_clamp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1716) mss_now = tp->rx_opt.mss_clamp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1717)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1718) /* Now subtract optional transport overhead */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1719) mss_now -= icsk->icsk_ext_hdr_len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1720)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1721) /* Then reserve room for full set of TCP options and 8 bytes of data */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1722) mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1723) return mss_now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1724) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1725)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1726) /* Calculate MSS. Not accounting for SACKs here. */
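/* Example: continuing the IPv4/MTU-1500 case above, a connection that
 * negotiated timestamps has tcp_header_len = 20 + 12, so this typically
 * returns 1460 - 12 = 1448.
 */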
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1727) int tcp_mtu_to_mss(struct sock *sk, int pmtu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1728) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1729) /* Subtract TCP options size, not including SACKs */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1730) return __tcp_mtu_to_mss(sk, pmtu) -
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1731) (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1732) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1733) EXPORT_SYMBOL(tcp_mtu_to_mss);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1734)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1735) /* Inverse of above */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1736) int tcp_mss_to_mtu(struct sock *sk, int mss)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1737) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1738) const struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1739) const struct inet_connection_sock *icsk = inet_csk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1740) int mtu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1741)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1742) mtu = mss +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1743) tp->tcp_header_len +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1744) icsk->icsk_ext_hdr_len +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1745) icsk->icsk_af_ops->net_header_len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1746)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1747) /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1748) if (icsk->icsk_af_ops->net_frag_header_len) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1749) const struct dst_entry *dst = __sk_dst_get(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1750)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1751) if (dst && dst_allfrag(dst))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1752) mtu += icsk->icsk_af_ops->net_frag_header_len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1753) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1754) return mtu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1755) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1756) EXPORT_SYMBOL(tcp_mss_to_mtu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1757)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1758) /* MTU probing init per socket */
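/* Example: with IPv4, an mss_clamp of 1460 and the default tcp_base_mss of
 * 1024, the probe range starts as search_high = 1460 + 20 + 20 = 1500 and
 * search_low = tcp_mss_to_mtu(sk, 1024) (1076 when timestamps are in use).
 */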
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1759) void tcp_mtup_init(struct sock *sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1760) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1761) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1762) struct inet_connection_sock *icsk = inet_csk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1763) struct net *net = sock_net(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1764)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1765) icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1766) icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1767) icsk->icsk_af_ops->net_header_len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1768) icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1769) icsk->icsk_mtup.probe_size = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1770) if (icsk->icsk_mtup.enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1771) icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1772) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1773) EXPORT_SYMBOL(tcp_mtup_init);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1774)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1775) /* This function synchronizes snd mss to the current pmtu/exthdr set.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1776)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1777)    tp->rx_opt.user_mss is the mss set by the user via TCP_MAXSEG. It does NOT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1778)    account for TCP options; it covers only the bare TCP header.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1779)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1780)    tp->rx_opt.mss_clamp is the mss negotiated at connection setup.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1781)    It is the minimum of user_mss and the mss received with the SYN.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1782)    It also does not include TCP options.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1783)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1784)    inet_csk(sk)->icsk_pmtu_cookie is the last pmtu seen by this function.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1785)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1786)    tp->mss_cache is the current effective sending mss, including
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1787)    all tcp options except SACKs. It is evaluated taking the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1788)    current pmtu into account, but never exceeds
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1789)    tp->rx_opt.mss_clamp.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1790)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1791) NOTE1. rfc1122 clearly states that advertised MSS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1792) DOES NOT include either tcp or ip options.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1793)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1794) NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1795) are READ ONLY outside this function. --ANK (980731)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1796) */
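/* Example: with IPv4, timestamps in use and a comfortably large peer window,
 * a path MTU drop to 1400 caches the new pmtu and shrinks tp->mss_cache to
 * 1400 - 20 - 20 - 12 = 1348; if MTU probing is enabled, mss_cache is further
 * capped by the mss implied by the current search_low.
 */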
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1797) unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1798) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1799) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1800) struct inet_connection_sock *icsk = inet_csk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1801) int mss_now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1802)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1803) if (icsk->icsk_mtup.search_high > pmtu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1804) icsk->icsk_mtup.search_high = pmtu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1805)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1806) mss_now = tcp_mtu_to_mss(sk, pmtu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1807) mss_now = tcp_bound_to_half_wnd(tp, mss_now);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1808)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1809) /* And store cached results */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1810) icsk->icsk_pmtu_cookie = pmtu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1811) if (icsk->icsk_mtup.enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1812) mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1813) tp->mss_cache = mss_now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1814)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1815) return mss_now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1816) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1817) EXPORT_SYMBOL(tcp_sync_mss);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1818)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1819) /* Compute the current effective MSS, taking SACKs and IP options,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1820) * and even PMTU discovery events into account.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1821) */
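/* Example: tp->mss_cache already accounts for the fixed options (e.g. 12
 * bytes of timestamps), so the adjustment normally needed here is for
 * variable options such as SACK blocks: one SACK block costs 4 + 8 = 12
 * bytes, shrinking the usable mss by 12 for that particular packet.
 */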
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1822) unsigned int tcp_current_mss(struct sock *sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1823) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1824) const struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1825) const struct dst_entry *dst = __sk_dst_get(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1826) u32 mss_now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1827) unsigned int header_len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1828) struct tcp_out_options opts;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1829) struct tcp_md5sig_key *md5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1830)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1831) mss_now = tp->mss_cache;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1832)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1833) if (dst) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1834) u32 mtu = dst_mtu(dst);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1835) if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1836) mss_now = tcp_sync_mss(sk, mtu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1837) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1838)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1839) header_len = tcp_established_options(sk, NULL, &opts, &md5) +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1840) sizeof(struct tcphdr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1841) /* The mss_cache is sized based on tp->tcp_header_len, which assumes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1842) * some common options. If this is an odd packet (because we have SACK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1843) * blocks etc) then our calculated header_len will be different, and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1844) * we have to adjust mss_now correspondingly */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1845) if (header_len != tp->tcp_header_len) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1846) int delta = (int) header_len - tp->tcp_header_len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1847) mss_now -= delta;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1848) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1849)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1850) return mss_now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1851) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1852)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1853) /* RFC2861, slow part. Adjust cwnd after it was not full during one rto.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1854)  * As additional protection, we do not touch cwnd in retransmission phases,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1855)  * nor if the application has hit its sndbuf limit recently.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1856) */
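/* Example: if snd_cwnd is 100 but at most win_used = 20 packets were ever in
 * flight during the window, ssthresh is refreshed and cwnd decays towards
 * actual usage: (100 + 20) / 2 = 60.
 */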
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1857) static void tcp_cwnd_application_limited(struct sock *sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1858) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1859) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1860)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1861) if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1862) sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1863) /* Limited by application or receiver window. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1864) u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1865) u32 win_used = max(tp->snd_cwnd_used, init_win);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1866) if (win_used < tp->snd_cwnd) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1867) tp->snd_ssthresh = tcp_current_ssthresh(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1868) tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1869) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1870) tp->snd_cwnd_used = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1871) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1872) tp->snd_cwnd_stamp = tcp_jiffies32;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1873) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1874)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1875) static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1876) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1877) const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1878) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1879)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1880) /* Track the maximum number of outstanding packets in each
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1881) * window, and remember whether we were cwnd-limited then.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1882) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1883) if (!before(tp->snd_una, tp->max_packets_seq) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1884) tp->packets_out > tp->max_packets_out ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1885) is_cwnd_limited) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1886) tp->max_packets_out = tp->packets_out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1887) tp->max_packets_seq = tp->snd_nxt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1888) tp->is_cwnd_limited = is_cwnd_limited;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1889) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1890)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1891) if (tcp_is_cwnd_limited(sk)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1892) 		/* Network is fed fully. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1893) tp->snd_cwnd_used = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1894) tp->snd_cwnd_stamp = tcp_jiffies32;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1895) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1896) /* Network starves. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1897) if (tp->packets_out > tp->snd_cwnd_used)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1898) tp->snd_cwnd_used = tp->packets_out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1899)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1900) if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1901) (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1902) !ca_ops->cong_control)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1903) tcp_cwnd_application_limited(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1904)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1905) /* The following conditions together indicate the starvation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1906) * is caused by insufficient sender buffer:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1907) * 1) just sent some data (see tcp_write_xmit)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1908) * 2) not cwnd limited (this else condition)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1909) * 3) no more data to send (tcp_write_queue_empty())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1910) * 4) application is hitting buffer limit (SOCK_NOSPACE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1911) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1912) if (tcp_write_queue_empty(sk) && sk->sk_socket &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1913) test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1914) (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1915) tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1916) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1917) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1918)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1919) /* Minshall's variant of the Nagle send check. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1920) static bool tcp_minshall_check(const struct tcp_sock *tp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1921) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1922) return after(tp->snd_sml, tp->snd_una) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1923) !after(tp->snd_sml, tp->snd_nxt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1924) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1925)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1926) /* Update snd_sml if this skb is under mss
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1927) * Note that a TSO packet might end with a sub-mss segment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1928) * The test is really :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1929) * if ((skb->len % mss) != 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1930) * tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1931) * But we can avoid doing the divide again given we already have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1932) * skb_pcount = skb->len / mss_now
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1933) */
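/* Example: with mss_now = 1448, an skb of 3000 bytes has pcount 3 and
 * 3000 < 3 * 1448, so it ends in a sub-mss segment and snd_sml is updated;
 * an skb of exactly 2896 bytes (2 * 1448) does not.
 */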
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1934) static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1935) const struct sk_buff *skb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1936) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1937) if (skb->len < tcp_skb_pcount(skb) * mss_now)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1938) tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1939) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1940)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1941) /* Return false if the packet can be sent now without violating Nagle's rules:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1942) * 1. It is full sized. (provided by caller in %partial bool)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1943) * 2. Or it contains FIN. (already checked by caller)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1944) * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1945) * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1946) * With Minshall's modification: all sent small packets are ACKed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1947) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1948) static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1949) int nonagle)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1950) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1951) return partial &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1952) ((nonagle & TCP_NAGLE_CORK) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1953) (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1954) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1955)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1956) /* Return how many segs we'd like on a TSO packet,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1957) * to send one TSO packet per ms
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1958) */
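/* Example: at a pacing rate of 100 Mbit/s (~12.5 MB/s) and the default
 * pacing shift of 10, the byte budget is ~12 KB (roughly 1 ms of data),
 * i.e. about 8 segments at an mss of 1448; min_tso_segs sets the floor and
 * the caller caps the result at sk_gso_max_segs.
 */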
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1959) static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1960) int min_tso_segs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1961) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1962) u32 bytes, segs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1963)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1964) bytes = min_t(unsigned long,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1965) sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1966) sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1967)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1968) /* Goal is to send at least one packet per ms,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1969) * not one big TSO packet every 100 ms.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1970) * This preserves ACK clocking and is consistent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1971) * with tcp_tso_should_defer() heuristic.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1972) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1973) segs = max_t(u32, bytes / mss_now, min_tso_segs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1974)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1975) return segs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1976) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1977)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1978) /* Return the number of segments we want in the skb we are transmitting.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1979) * See if congestion control module wants to decide; otherwise, autosize.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1980) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1981) static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1982) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1983) const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1984) u32 min_tso, tso_segs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1985)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1986) min_tso = ca_ops->min_tso_segs ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1987) ca_ops->min_tso_segs(sk) :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1988) sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1989)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1990) tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1991) return min_t(u32, tso_segs, sk->sk_gso_max_segs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1992) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1993)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1994) /* Returns the portion of skb which can be sent right away */
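/* Example: with mss_now = 1000, max_segs = 10 and 4500 bytes of send window
 * left, a 6000-byte skb is limited to needed = 4500; if the Nagle/Minshall
 * test refuses the 500-byte tail, the split point drops to 4000.
 */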
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1995) static unsigned int tcp_mss_split_point(const struct sock *sk,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1996) const struct sk_buff *skb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1997) unsigned int mss_now,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1998) unsigned int max_segs,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1999) int nonagle)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2000) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2001) const struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2002) u32 partial, needed, window, max_len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2003)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2004) window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2005) max_len = mss_now * max_segs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2006)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2007) if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2008) return max_len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2009)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2010) needed = min(skb->len, window);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2011)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2012) if (max_len <= needed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2013) return max_len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2014)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2015) partial = needed % mss_now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2016) /* If last segment is not a full MSS, check if Nagle rules allow us
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2017) * to include this last segment in this skb.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2018) * Otherwise, we'll split the skb at last MSS boundary
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2019) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2020) if (tcp_nagle_check(partial != 0, tp, nonagle))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2021) return needed - partial;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2022)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2023) return needed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2024) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2025)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2026) /* Can at least one segment of SKB be sent right now, according to the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2027) * congestion window rules? If so, return how many segments are allowed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2028) */
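/* Example: with snd_cwnd = 10 and 7 packets in flight, the budget is
 * min(cwnd/2, cwnd - in_flight) = min(5, 3) = 3 segments; a lone FIN is
 * allowed regardless of the congestion window.
 */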
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2029) static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2030) const struct sk_buff *skb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2031) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2032) u32 in_flight, cwnd, halfcwnd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2033)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2034) /* Don't be strict about the congestion window for the final FIN. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2035) if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2036) tcp_skb_pcount(skb) == 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2037) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2038)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2039) in_flight = tcp_packets_in_flight(tp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2040) cwnd = tp->snd_cwnd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2041) if (in_flight >= cwnd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2042) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2043)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2044) /* For better scheduling, ensure we have at least
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2045) * 2 GSO packets in flight.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2046) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2047) halfcwnd = max(cwnd >> 1, 1U);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2048) return min(halfcwnd, cwnd - in_flight);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2049) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2050)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2051) /* Initialize TSO state of a skb.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2052) * This must be invoked the first time we consider transmitting
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2053) * SKB onto the wire.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2054) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2055) static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2056) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2057) int tso_segs = tcp_skb_pcount(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2058)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2059) if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2060) tcp_set_skb_tso_segs(skb, mss_now);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2061) tso_segs = tcp_skb_pcount(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2062) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2063) return tso_segs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2064) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2065)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2066)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2067) /* Return true if the Nagle test allows this packet to be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2068) * sent now.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2069) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2070) static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2071) unsigned int cur_mss, int nonagle)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2072) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2073) 	/* The Nagle rule does not apply to frames which sit in the middle of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2074) 	 * write_queue (they have no chance to get new data).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2075) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2076) * This is implemented in the callers, where they modify the 'nonagle'
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2077) * argument based upon the location of SKB in the send queue.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2078) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2079) if (nonagle & TCP_NAGLE_PUSH)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2080) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2081)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2082) /* Don't use the nagle rule for urgent data (or for the final FIN). */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2083) if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2084) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2085)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2086) if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2087) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2088)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2089) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2090) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2091)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2092) /* Does at least the first segment of SKB fit into the send window? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2093) static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2094) const struct sk_buff *skb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2095) unsigned int cur_mss)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2096) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2097) u32 end_seq = TCP_SKB_CB(skb)->end_seq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2098)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2099) if (skb->len > cur_mss)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2100) end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2101)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2102) return !after(end_seq, tcp_wnd_end(tp));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2103) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2104)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2105) /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2106) * which is put after SKB on the list. It is very much like
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2107) * tcp_fragment() except that it may make several kinds of assumptions
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2108) * in order to speed up the splitting operation. In particular, we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2109) * know that all the data is in scatter-gather pages, and that the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2110) * packet has never been sent out before (and thus is not cloned).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2111) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2112) static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2113) unsigned int mss_now, gfp_t gfp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2114) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2115) int nlen = skb->len - len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2116) struct sk_buff *buff;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2117) u8 flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2118)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2119) /* All of a TSO frame must be composed of paged data. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2120) if (skb->len != skb->data_len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2121) return tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2122) skb, len, mss_now, gfp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2123)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2124) buff = sk_stream_alloc_skb(sk, 0, gfp, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2125) if (unlikely(!buff))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2126) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2127) skb_copy_decrypted(buff, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2128)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2129) sk_wmem_queued_add(sk, buff->truesize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2130) sk_mem_charge(sk, buff->truesize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2131) buff->truesize += nlen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2132) skb->truesize -= nlen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2133)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2134) /* Correct the sequence numbers. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2135) TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2136) TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2137) TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2138)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2139) /* PSH and FIN should only be set in the second packet. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2140) flags = TCP_SKB_CB(skb)->tcp_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2141) TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2142) TCP_SKB_CB(buff)->tcp_flags = flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2143)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2144) /* This packet was never sent out yet, so no SACK bits. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2145) TCP_SKB_CB(buff)->sacked = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2146)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2147) tcp_skb_fragment_eor(skb, buff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2148)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2149) buff->ip_summed = CHECKSUM_PARTIAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2150) skb_split(skb, buff, len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2151) tcp_fragment_tstamp(skb, buff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2152)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2153) /* Fix up tso_factor for both original and new SKB. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2154) tcp_set_skb_tso_segs(skb, mss_now);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2155) tcp_set_skb_tso_segs(buff, mss_now);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2156)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2157) /* Link BUFF into the send queue. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2158) __skb_header_release(buff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2159) tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2160)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2161) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2162) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2163)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2164) /* Try to defer sending, if possible, in order to minimize the amount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2165) * of TSO splitting we do. View it as a kind of TSO Nagle test.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2166) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2167) * This algorithm is from John Heffner.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2168) */
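/* Example: with mss_cache = 1448, max_segs = 64, send_win = 90000 and
 * cong_win = 20000, the 20000-byte limit is far below a full 64-segment
 * burst (~92 KB), so a large tail skb is a deferral candidate; once the
 * divisor and ACK-timing shortcuts below also decline to send, it is
 * deferred and marked cwnd-limited (cong_win < send_win, cong_win <= len).
 */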
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2169) static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2170) bool *is_cwnd_limited,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2171) bool *is_rwnd_limited,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2172) u32 max_segs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2173) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2174) const struct inet_connection_sock *icsk = inet_csk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2175) u32 send_win, cong_win, limit, in_flight;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2176) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2177) struct sk_buff *head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2178) int win_divisor;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2179) s64 delta;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2180)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2181) if (icsk->icsk_ca_state >= TCP_CA_Recovery)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2182) goto send_now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2183)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2184) /* Avoid bursty behavior by allowing defer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2185) * only if the last write was recent (1 ms).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2186) * Note that tp->tcp_wstamp_ns can be in the future if we have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2187) * packets waiting in a qdisc or device for EDT delivery.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2188) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2189) delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2190) if (delta > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2191) goto send_now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2192)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2193) in_flight = tcp_packets_in_flight(tp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2194)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2195) BUG_ON(tcp_skb_pcount(skb) <= 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2196) BUG_ON(tp->snd_cwnd <= in_flight);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2197)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2198) send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2199)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2200) /* From in_flight test above, we know that cwnd > in_flight. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2201) cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2202)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2203) limit = min(send_win, cong_win);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2204)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2205) /* If a full-sized TSO skb can be sent, do it. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2206) if (limit >= max_segs * tp->mss_cache)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2207) goto send_now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2208)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2209) 	/* A middle-of-queue skb won't get more data; fully sendable already? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2210) if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2211) goto send_now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2212)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2213) win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2214) if (win_divisor) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2215) u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2216)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2217) /* If at least some fraction of a window is available,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2218) * just use it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2219) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2220) chunk /= win_divisor;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2221) if (limit >= chunk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2222) goto send_now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2223) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2224) /* Different approach, try not to defer past a single
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2225) * ACK. Receiver should ACK every other full sized
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2226) * frame, so if we have space for more than 3 frames
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2227) * then send now.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2228) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2229) if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2230) goto send_now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2231) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2232)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2233) /* TODO : use tsorted_sent_queue ? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2234) head = tcp_rtx_queue_head(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2235) if (!head)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2236) goto send_now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2237) delta = tp->tcp_clock_cache - head->tstamp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2238) /* If next ACK is likely to come too late (half srtt), do not defer */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2239) if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2240) goto send_now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2241)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2242) /* Ok, it looks like it is advisable to defer.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2243) * Three cases are tracked :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2244) * 1) We are cwnd-limited
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2245) * 2) We are rwnd-limited
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2246) * 3) We are application limited.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2247) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2248) if (cong_win < send_win) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2249) if (cong_win <= skb->len) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2250) *is_cwnd_limited = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2251) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2252) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2253) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2254) if (send_win <= skb->len) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2255) *is_rwnd_limited = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2256) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2257) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2258) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2259)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2260) /* If this packet won't get more data, do not wait. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2261) if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2262) TCP_SKB_CB(skb)->eor)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2263) goto send_now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2264)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2265) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2266)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2267) send_now:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2268) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2269) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2270)
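/* If more than sysctl_tcp_probe_interval seconds have passed since the last
 * probe attempt, assume the path may have changed: rebuild the search range
 * (search_high from mss_clamp, search_low from the current mss) and refresh
 * the probe timestamp.
 */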
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2271) static inline void tcp_mtu_check_reprobe(struct sock *sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2272) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2273) struct inet_connection_sock *icsk = inet_csk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2274) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2275) struct net *net = sock_net(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2276) u32 interval;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2277) s32 delta;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2278)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2279) interval = net->ipv4.sysctl_tcp_probe_interval;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2280) delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2281) if (unlikely(delta >= interval * HZ)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2282) int mss = tcp_current_mss(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2283)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2284) /* Update current search range */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2285) icsk->icsk_mtup.probe_size = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2286) icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2287) sizeof(struct tcphdr) +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2288) icsk->icsk_af_ops->net_header_len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2289) icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2290)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2291) /* Update probe time stamp */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2292) icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2293) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2294) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2295)
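/* Check whether the first len bytes of the write queue can be merged into a
 * single probe: walking from the send head, every skb fully consumed by the
 * probe must carry neither an EOR mark nor a tx timestamp request, since
 * those must not be coalesced away.
 */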
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2296) static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2297) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2298) struct sk_buff *skb, *next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2299)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2300) skb = tcp_send_head(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2301) tcp_for_write_queue_from_safe(skb, next, sk) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2302) if (len <= skb->len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2303) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2304)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2305) if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2306) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2307)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2308) len -= skb->len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2309) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2310)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2311) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2312) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2313)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2314) /* Create a new MTU probe if we are ready.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2315) * MTU probing regularly attempts to increase the path MTU by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2316) * deliberately sending larger packets. This discovers routing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2317) * changes resulting in larger path MTUs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2318) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2319) * Returns 0 if we should wait to probe (no cwnd available),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2320) * 1 if a probe was sent,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2321) * -1 otherwise
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2322) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2323) static int tcp_mtu_probe(struct sock *sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2324) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2325) struct inet_connection_sock *icsk = inet_csk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2326) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2327) struct sk_buff *skb, *nskb, *next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2328) struct net *net = sock_net(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2329) int probe_size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2330) int size_needed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2331) int copy, len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2332) int mss_now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2333) int interval;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2334)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2335) /* Not currently probing/verifying,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2336) * not in recovery,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2337) * have enough cwnd, and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2338) * not SACKing (the variable headers throw things off)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2339) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2340) if (likely(!icsk->icsk_mtup.enabled ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2341) icsk->icsk_mtup.probe_size ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2342) inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2343) tp->snd_cwnd < 11 ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2344) tp->rx_opt.num_sacks || tp->rx_opt.dsack))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2345) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2346)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2347) /* Use binary search for probe_size between tcp_mss_base
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2348) * and the current mss_clamp. If (search_high - search_low) is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2349) * smaller than a threshold, back off from probing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2350) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2351) mss_now = tcp_current_mss(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2352) probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2353) icsk->icsk_mtup.search_low) >> 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2354) size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2355) interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2356) /* When misfortune happens, we are reprobing actively while
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2357) * the reprobe timer has expired. We stick with the current
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2358) * probing process by not resetting the search range to its original.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2359) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2360) if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2361) interval < net->ipv4.sysctl_tcp_probe_threshold) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2362) /* Check whether enough time has elapsed for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2363) * another round of probing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2364) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2365) tcp_mtu_check_reprobe(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2366) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2367) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2368)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2369) /* Have enough data in the send queue to probe? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2370) if (tp->write_seq - tp->snd_nxt < size_needed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2371) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2372)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2373) if (tp->snd_wnd < size_needed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2374) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2375) if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2376) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2377)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2378) /* Do we need to wait to drain cwnd? With none in flight, don't stall */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2379) if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2380) if (!tcp_packets_in_flight(tp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2381) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2382) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2383) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2384) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2385)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2386) if (!tcp_can_coalesce_send_queue_head(sk, probe_size))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2387) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2388)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2389) /* We're allowed to probe. Build it now. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2390) nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2391) if (!nskb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2392) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2393) sk_wmem_queued_add(sk, nskb->truesize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2394) sk_mem_charge(sk, nskb->truesize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2395)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2396) skb = tcp_send_head(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2397) skb_copy_decrypted(nskb, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2398)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2399) TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2400) TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2401) TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2402) TCP_SKB_CB(nskb)->sacked = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2403) nskb->csum = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2404) nskb->ip_summed = CHECKSUM_PARTIAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2405)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2406) tcp_insert_write_queue_before(nskb, skb, sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2407) tcp_highest_sack_replace(sk, skb, nskb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2408)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2409) len = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2410) tcp_for_write_queue_from_safe(skb, next, sk) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2411) copy = min_t(int, skb->len, probe_size - len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2412) skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2413)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2414) if (skb->len <= copy) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2415) /* We've eaten all the data from this skb.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2416) * Throw it away. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2417) TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2418) /* If this is the last SKB we copy and eor is set
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2419) * we need to propagate it to the new skb.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2420) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2421) TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2422) tcp_skb_collapse_tstamp(nskb, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2423) tcp_unlink_write_queue(skb, sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2424) sk_wmem_free_skb(sk, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2425) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2426) TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2427) ~(TCPHDR_FIN|TCPHDR_PSH);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2428) if (!skb_shinfo(skb)->nr_frags) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2429) skb_pull(skb, copy);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2430) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2431) __pskb_trim_head(skb, copy);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2432) tcp_set_skb_tso_segs(skb, mss_now);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2433) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2434) TCP_SKB_CB(skb)->seq += copy;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2435) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2436)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2437) len += copy;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2438)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2439) if (len >= probe_size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2440) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2441) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2442) tcp_init_tso_segs(nskb, nskb->len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2443)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2444) /* We're ready to send. If this fails, the probe will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2445) * be resegmented into mss-sized pieces by tcp_write_xmit().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2446) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2447) if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2448) /* Decrement cwnd here because we are sending
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2449) * effectively two packets. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2450) tp->snd_cwnd--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2451) tcp_event_new_data_sent(sk, nskb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2452)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2453) icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2454) tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2455) tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2456)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2457) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2458) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2459)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2460) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2461) }
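
/* Illustrative sketch (not used by the stack): the probe size selection
 * performed by the binary search in tcp_mtu_probe() above, simplified.
 * The real code converts between MTU and MSS via tcp_mtu_to_mss() /
 * tcp_mss_to_mtu() and also rejects a midpoint above search_high.
 */
static inline u32 tcp_doc_example_probe_mtu(u32 search_low, u32 search_high,
					    u32 threshold)
{
	/* e.g. search_low = 1052, search_high = 1500: probe the midpoint
	 * 1276; on success search_low moves up, on timeout search_high
	 * moves down, halving the remaining range each round.
	 */
	if (search_high - search_low < threshold)
		return 0;	/* range too narrow: back off from probing */

	return (search_high + search_low) >> 1;
}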
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2462)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2463) static bool tcp_pacing_check(struct sock *sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2464) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2465) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2466)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2467) if (!tcp_needs_internal_pacing(sk))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2468) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2469)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2470) if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2471) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2472)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2473) if (!hrtimer_is_queued(&tp->pacing_timer)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2474) hrtimer_start(&tp->pacing_timer,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2475) ns_to_ktime(tp->tcp_wstamp_ns),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2476) HRTIMER_MODE_ABS_PINNED_SOFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2477) sock_hold(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2478) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2479) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2480) }
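
/* Illustrative sketch (not used by the stack): the gate implemented by
 * tcp_pacing_check() above. tcp_wstamp_ns holds the earliest permitted
 * departure time of the next packet; if it is still ahead of the cached
 * clock, transmission is deferred to the pacing hrtimer.
 */
static inline bool tcp_doc_example_pacing_defer(u64 next_departure_ns,
						u64 clock_cache_ns)
{
	return next_departure_ns > clock_cache_ns;	/* too early: defer */
}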
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2481)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2482) /* TCP Small Queues :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2483) * Control the number of packets in qdisc/devices to two packets or ~1 ms.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2484) * (These limits are doubled for retransmits)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2485) * This allows for:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2486) * - better RTT estimation and ACK scheduling
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2487) * - faster recovery
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2488) * - high rates
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2489) * Alas, some drivers / subsystems require a fair amount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2490) * of queued bytes to ensure line rate.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2491) * One example is wifi aggregation (802.11 AMPDU)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2492) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2493) static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2494) unsigned int factor)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2495) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2496) unsigned long limit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2497)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2498) limit = max_t(unsigned long,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2499) 2 * skb->truesize,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2500) sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2501) if (sk->sk_pacing_status == SK_PACING_NONE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2502) limit = min_t(unsigned long, limit,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2503) sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2504) limit <<= factor;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2505)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2506) if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2507) tcp_sk(sk)->tcp_tx_delay) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2508) u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2509)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2510) /* TSQ is based on skb truesize sum (sk_wmem_alloc), so we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2511) * approximate our needs assuming an ~100% skb->truesize overhead.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2512) * USEC_PER_SEC is approximated by 2^20.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2513) * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2514) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2515) extra_bytes >>= (20 - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2516) limit += extra_bytes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2517) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2518) if (refcount_read(&sk->sk_wmem_alloc) > limit) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2519) /* Always send skb if rtx queue is empty.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2520) * No need to wait for TX completion to call us back
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2521) * after softirq/tasklet schedule.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2522) * This helps when TX completions are delayed too much.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2523) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2524) if (tcp_rtx_queue_empty(sk))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2525) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2526)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2527) set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2528) /* It is possible TX completion already happened
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2529) * before we set TSQ_THROTTLED, so we must
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2530) * test the condition again.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2531) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2532) smp_mb__after_atomic();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2533) if (refcount_read(&sk->sk_wmem_alloc) > limit)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2534) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2535) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2536) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2537) }
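
/* Illustrative sketch (not used by the stack): the core of the TSQ limit
 * computed by tcp_small_queue_check() above, in isolation. All parameters
 * are example inputs; the real code additionally caps the limit by
 * tcp_limit_output_bytes when pacing is off, adds a tcp_tx_delay
 * component, and re-checks sk_wmem_alloc after setting TSQ_THROTTLED.
 */
static inline unsigned long tcp_doc_example_tsq_limit(unsigned long pacing_rate,
						      int pacing_shift,
						      unsigned long skb_truesize,
						      unsigned int factor)
{
	/* Roughly 1/(1 << pacing_shift) of a second worth of bytes at the
	 * current pacing rate, never less than two skbs: e.g. 125 MB/s
	 * with a shift of 10 gives about 122 KB (~1 ms of data).
	 */
	unsigned long limit = max_t(unsigned long, 2 * skb_truesize,
				    pacing_rate >> pacing_shift);

	return limit << factor;	/* doubled (factor == 1) for retransmits */
}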
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2538)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2539) static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2540) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2541) const u32 now = tcp_jiffies32;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2542) enum tcp_chrono old = tp->chrono_type;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2543)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2544) if (old > TCP_CHRONO_UNSPEC)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2545) tp->chrono_stat[old - 1] += now - tp->chrono_start;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2546) tp->chrono_start = now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2547) tp->chrono_type = new;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2548) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2549)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2550) void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2551) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2552) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2553)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2554) /* If there are multiple conditions worthy of tracking in a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2555) * chronograph then the highest priority enum takes precedence
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2556) * over the other conditions. So if something "more interesting"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2557) * starts happening, stop the previous chrono and start a new one.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2558) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2559) if (type > tp->chrono_type)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2560) tcp_chrono_set(tp, type);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2561) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2562)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2563) void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2564) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2565) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2566)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2568) /* There are multiple conditions worthy of tracking in a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2569) * chronograph, so that the highest priority enum takes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2570) * precedence over the other conditions (see tcp_chrono_start).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2571) * If a condition stops, we only stop chrono tracking if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2572) * it's the "most interesting" (i.e. current) chrono we are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2573) * tracking, and start the busy chrono if we have pending data.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2574) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2575) if (tcp_rtx_and_write_queues_empty(sk))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2576) tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2577) else if (type == tp->chrono_type)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2578) tcp_chrono_set(tp, TCP_CHRONO_BUSY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2579) }
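
/* Illustrative sketch (not used by the stack): the accounting performed
 * by tcp_chrono_set() above. chrono_stat[] keeps one bucket per tracked
 * state (busy, rwnd limited, sndbuf limited); every transition credits
 * the elapsed jiffies to the bucket of the state being left.
 */
static inline void tcp_doc_example_chrono_account(u32 *stat_bucket,
						  u32 *chrono_start,
						  u32 now)
{
	*stat_bucket += now - *chrono_start;	/* time spent in old state */
	*chrono_start = now;			/* new state starts now */
}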
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2580)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2581) /* This routine writes packets to the network. It advances the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2582) * send_head. This happens as incoming acks open up the remote
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2583) * window for us.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2584) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2585) * LARGESEND note: !tcp_urg_mode is overkill, only frames between
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2586) * snd_up-64k-mss .. snd_up cannot be large. However, taking into
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2587) * account rare use of URG, this is not a big flaw.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2588) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2589) * Send at most one packet when push_one > 0. Temporarily ignore
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2590) * cwnd limit to force at most one packet out when push_one == 2.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2591) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2592) * Returns true if no segments are in flight and we have queued segments,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2593) * but cannot send anything now because of SWS or another problem.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2594) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2595) static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2596) int push_one, gfp_t gfp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2597) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2598) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2599) struct sk_buff *skb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2600) unsigned int tso_segs, sent_pkts;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2601) int cwnd_quota;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2602) int result;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2603) bool is_cwnd_limited = false, is_rwnd_limited = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2604) u32 max_segs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2605)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2606) sent_pkts = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2607)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2608) tcp_mstamp_refresh(tp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2609) if (!push_one) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2610) /* Do MTU probing. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2611) result = tcp_mtu_probe(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2612) if (!result) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2613) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2614) } else if (result > 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2615) sent_pkts = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2616) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2617) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2618)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2619) max_segs = tcp_tso_segs(sk, mss_now);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2620) while ((skb = tcp_send_head(sk))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2621) unsigned int limit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2622)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2623) if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2624) /* "skb_mstamp_ns" is used as a start point for the retransmit timer */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2625) skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2626) list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2627) tcp_init_tso_segs(skb, mss_now);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2628) goto repair; /* Skip network transmission */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2629) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2630)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2631) if (tcp_pacing_check(sk))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2632) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2633)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2634) tso_segs = tcp_init_tso_segs(skb, mss_now);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2635) BUG_ON(!tso_segs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2636)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2637) cwnd_quota = tcp_cwnd_test(tp, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2638) if (!cwnd_quota) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2639) if (push_one == 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2640) /* Force out a loss probe pkt. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2641) cwnd_quota = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2642) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2643) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2644) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2645)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2646) if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2647) is_rwnd_limited = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2648) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2649) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2650)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2651) if (tso_segs == 1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2652) if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2653) (tcp_skb_is_last(sk, skb) ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2654) nonagle : TCP_NAGLE_PUSH))))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2655) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2656) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2657) if (!push_one &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2658) tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2659) &is_rwnd_limited, max_segs))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2660) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2661) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2662)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2663) limit = mss_now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2664) if (tso_segs > 1 && !tcp_urg_mode(tp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2665) limit = tcp_mss_split_point(sk, skb, mss_now,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2666) min_t(unsigned int,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2667) cwnd_quota,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2668) max_segs),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2669) nonagle);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2670)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2671) if (skb->len > limit &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2672) unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2673) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2674)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2675) if (tcp_small_queue_check(sk, skb, 0))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2676) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2677)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2678) /* Argh, we hit an empty skb, presumably a thread
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2679) * is sleeping in sendmsg()/sk_stream_wait_memory().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2680) * We do not want to send a pure-ack packet and have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2681) * a strange looking rtx queue with empty packet(s).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2682) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2683) if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2684) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2685)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2686) if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2687) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2688)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2689) repair:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2690) /* Advance the send_head. This one is sent out.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2691) * This call will increment packets_out.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2692) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2693) tcp_event_new_data_sent(sk, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2694)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2695) tcp_minshall_update(tp, mss_now, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2696) sent_pkts += tcp_skb_pcount(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2697)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2698) if (push_one)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2699) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2700) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2701)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2702) if (is_rwnd_limited)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2703) tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2704) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2705) tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2706)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2707) is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2708) if (likely(sent_pkts || is_cwnd_limited))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2709) tcp_cwnd_validate(sk, is_cwnd_limited);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2710)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2711) if (likely(sent_pkts)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2712) if (tcp_in_cwnd_reduction(sk))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2713) tp->prr_out += sent_pkts;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2714)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2715) /* Send one loss probe per tail loss episode. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2716) if (push_one != 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2717) tcp_schedule_loss_probe(sk, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2718) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2719) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2720) return !tp->packets_out && !tcp_write_queue_empty(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2721) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2722)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2723) bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2724) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2725) struct inet_connection_sock *icsk = inet_csk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2726) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2727) u32 timeout, rto_delta_us;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2728) int early_retrans;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2729)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2730) /* Don't do any loss probe on a Fast Open connection before 3WHS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2731) * finishes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2732) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2733) if (rcu_access_pointer(tp->fastopen_rsk))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2734) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2735)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2736) early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2737) /* Schedule a loss probe in 2*RTT for SACK-capable connections
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2738) * not in loss recovery that are either cwnd- or application-limited.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2739) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2740) if ((early_retrans != 3 && early_retrans != 4) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2741) !tp->packets_out || !tcp_is_sack(tp) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2742) (icsk->icsk_ca_state != TCP_CA_Open &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2743) icsk->icsk_ca_state != TCP_CA_CWR))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2744) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2745)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2746) /* Probe timeout is 2*rtt. Add minimum RTO to account
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2747) * for delayed ack when there's one outstanding packet. If no RTT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2748) * sample is available then probe after TCP_TIMEOUT_INIT.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2749) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2750) if (tp->srtt_us) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2751) timeout = usecs_to_jiffies(tp->srtt_us >> 2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2752) if (tp->packets_out == 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2753) timeout += TCP_RTO_MIN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2754) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2755) timeout += TCP_TIMEOUT_MIN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2756) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2757) timeout = TCP_TIMEOUT_INIT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2758) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2759)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2760) /* If the RTO formula yields an earlier time, then use that time. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2761) rto_delta_us = advancing_rto ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2762) jiffies_to_usecs(inet_csk(sk)->icsk_rto) :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2763) tcp_rto_delta_us(sk); /* How far in future is RTO? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2764) if (rto_delta_us > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2765) timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2766)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2767) tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2768) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2769) }
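
/* Illustrative sketch (not used by the stack): the probe timeout (PTO)
 * chosen by tcp_schedule_loss_probe() above. tp->srtt_us stores the
 * smoothed RTT scaled by 8, so srtt_us >> 2 is roughly 2 * SRTT. The
 * real code additionally clamps the result so it never fires after the
 * pending RTO.
 */
static inline u32 tcp_doc_example_tlp_timeout(u32 srtt_us_scaled8,
					      u32 packets_out)
{
	u32 timeout;

	if (!srtt_us_scaled8)
		return TCP_TIMEOUT_INIT;	/* no RTT sample yet */

	timeout = usecs_to_jiffies(srtt_us_scaled8 >> 2);	/* ~2 * SRTT */
	if (packets_out == 1)
		timeout += TCP_RTO_MIN;		/* room for a delayed ACK */
	else
		timeout += TCP_TIMEOUT_MIN;

	return timeout;
}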
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2770)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2771) /* Thanks to skb fast clones, we can detect if a prior transmit of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2772) * a packet is still in a qdisc or driver queue.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2773) * In this case, there is very little point in doing a retransmit!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2774) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2775) static bool skb_still_in_host_queue(const struct sock *sk,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2776) const struct sk_buff *skb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2777) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2778) if (unlikely(skb_fclone_busy(sk, skb))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2779) NET_INC_STATS(sock_net(sk),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2780) LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2781) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2782) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2783) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2784) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2785)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2786) /* When probe timeout (PTO) fires, try to send a new segment if possible, else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2787) * retransmit the last segment.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2788) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2789) void tcp_send_loss_probe(struct sock *sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2790) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2791) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2792) struct sk_buff *skb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2793) int pcount;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2794) int mss = tcp_current_mss(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2795)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2796) /* At most one outstanding TLP */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2797) if (tp->tlp_high_seq)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2798) goto rearm_timer;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2799)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2800) tp->tlp_retrans = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2801) skb = tcp_send_head(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2802) if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2803) pcount = tp->packets_out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2804) tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2805) if (tp->packets_out > pcount)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2806) goto probe_sent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2807) goto rearm_timer;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2808) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2809) skb = skb_rb_last(&sk->tcp_rtx_queue);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2810) if (unlikely(!skb)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2811) WARN_ONCE(tp->packets_out,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2812) "invalid inflight: %u state %u cwnd %u mss %d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2813) tp->packets_out, sk->sk_state, tp->snd_cwnd, mss);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2814) inet_csk(sk)->icsk_pending = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2815) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2816) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2817)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2818) if (skb_still_in_host_queue(sk, skb))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2819) goto rearm_timer;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2820)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2821) pcount = tcp_skb_pcount(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2822) if (WARN_ON(!pcount))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2823) goto rearm_timer;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2824)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2825) if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2826) if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2827) (pcount - 1) * mss, mss,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2828) GFP_ATOMIC)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2829) goto rearm_timer;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2830) skb = skb_rb_next(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2831) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2832)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2833) if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2834) goto rearm_timer;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2835)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2836) if (__tcp_retransmit_skb(sk, skb, 1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2837) goto rearm_timer;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2838)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2839) tp->tlp_retrans = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2840)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2841) probe_sent:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2842) /* Record snd_nxt for loss detection. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2843) tp->tlp_high_seq = tp->snd_nxt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2844)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2845) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2846) /* Reset s.t. tcp_rearm_rto will restart timer from now */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2847) inet_csk(sk)->icsk_pending = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2848) rearm_timer:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2849) tcp_rearm_rto(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2850) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2851)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2852) /* Push out any pending frames which were held back due to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2853) * TCP_CORK or attempt at coalescing tiny packets.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2854) * The socket must be locked by the caller.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2855) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2856) void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2857) int nonagle)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2858) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2859) /* If we are closed, the bytes will have to remain here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2860) * In time closedown will finish, we empty the write queue and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2861) * all will be happy.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2862) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2863) if (unlikely(sk->sk_state == TCP_CLOSE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2864) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2865)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2866) if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2867) sk_gfp_mask(sk, GFP_ATOMIC)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2868) tcp_check_probe_timer(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2869) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2870)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2871) /* Send the _single_ skb sitting at the send head. This function requires
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2872) * a true __tcp_push_pending_frames() call to set up the probe timer etc.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2873) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2874) void tcp_push_one(struct sock *sk, unsigned int mss_now)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2875) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2876) struct sk_buff *skb = tcp_send_head(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2877)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2878) BUG_ON(!skb || skb->len < mss_now);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2879)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2880) tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2881) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2882)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2883) /* This function returns the amount that we can raise the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2884) * usable window based on the following constraints
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2885) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2886) * 1. The window can never be shrunk once it is offered (RFC 793)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2887) * 2. We limit memory per socket
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2888) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2889) * RFC 1122:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2890) * "the suggested [SWS] avoidance algorithm for the receiver is to keep
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2891) * RCV.NXT + RCV.WND fixed until:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2892) * RCV.BUFF - RCV.USER - RCV.WND >= min(1/2 RCV.BUFF, MSS)"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2893) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2894) * i.e. don't raise the right edge of the window until you can raise
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2895) * it at least MSS bytes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2896) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2897) * Unfortunately, the recommended algorithm breaks header prediction,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2898) * since header prediction assumes th->window stays fixed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2899) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2900) * Strictly speaking, keeping th->window fixed violates the receiver
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2901) * side SWS prevention criteria. The problem is that under this rule
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2902) * a stream of single byte packets will cause the right side of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2903) * window to always advance by a single byte.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2904) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2905) * Of course, if the sender implements sender side SWS prevention
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2906) * then this will not be a problem.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2907) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2908) * BSD seems to make the following compromise:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2909) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2910) * If the free space is less than the 1/4 of the maximum
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2911) * space available and the free space is less than 1/2 mss,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2912) * then set the window to 0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2913) * [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2914) * Otherwise, just prevent the window from shrinking
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2915) * and from being larger than the largest representable value.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2916) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2917) * This prevents incremental opening of the window in the regime
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2918) * where TCP is limited by the speed of the reader side taking
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2919) * data out of the TCP receive queue. It does nothing about
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2920) * those cases where the window is constrained on the sender side
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2921) * because the pipeline is full.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2922) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2923) * BSD also seems to "accidentally" limit itself to windows that are a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2924) * multiple of MSS, at least until the free space gets quite small.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2925) * This would appear to be a side effect of the mbuf implementation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2926) * Combining these two algorithms results in the observed behavior
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2927) * of having a fixed window size at almost all times.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2928) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2929) * Below we obtain similar behavior by forcing the offered window to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2930) * a multiple of the mss when it is feasible to do so.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2931) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2932) * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2933) * Regular options like TIMESTAMP are taken into account.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2934) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2935) u32 __tcp_select_window(struct sock *sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2936) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2937) struct inet_connection_sock *icsk = inet_csk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2938) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2939) /* MSS for the peer's data. Previous versions used mss_clamp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2940) * here. I don't know if the value based on our guesses
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2941) * of the peer's MSS is better for performance. It's more correct
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2942) * but may be worse for performance because of rcv_mss
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2943) * fluctuations. --SAW 1998/11/1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2944) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2945) int mss = icsk->icsk_ack.rcv_mss;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2946) int free_space = tcp_space(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2947) int allowed_space = tcp_full_space(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2948) int full_space, window;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2949)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2950) if (sk_is_mptcp(sk))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2951) mptcp_space(sk, &free_space, &allowed_space);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2952)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2953) full_space = min_t(int, tp->window_clamp, allowed_space);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2954)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2955) if (unlikely(mss > full_space)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2956) mss = full_space;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2957) if (mss <= 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2958) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2959) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2960) if (free_space < (full_space >> 1)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2961) icsk->icsk_ack.quick = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2962)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2963) if (tcp_under_memory_pressure(sk))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2964) tp->rcv_ssthresh = min(tp->rcv_ssthresh,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2965) 4U * tp->advmss);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2966)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2967) /* free_space might become our new window, make sure we don't
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2968) * increase it due to wscale.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2969) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2970) free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2971)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2972) /* if free space is less than mss estimate, or is below 1/16th
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2973) * of the maximum allowed, try to move to zero-window, else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2974) * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2975) * new incoming data is dropped due to memory limits.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2976) * With a large window, the mss test triggers way too late
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2977) * to announce a zero window in time, before the rmem limit kicks in.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2978) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2979) if (free_space < (allowed_space >> 4) || free_space < mss)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2980) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2981) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2982)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2983) if (free_space > tp->rcv_ssthresh)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2984) free_space = tp->rcv_ssthresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2985)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2986) /* Don't do rounding if we are using window scaling, since the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2987) * scaled window will not line up with the MSS boundary anyway.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2988) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2989) if (tp->rx_opt.rcv_wscale) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2990) window = free_space;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2991)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2992) /* Advertise enough space so that it won't get scaled away.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2993) * Important case: prevent a zero window announcement if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2994) * 1<<rcv_wscale > mss.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2995) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2996) window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2997) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2998) window = tp->rcv_wnd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2999) /* Get the largest window that is a nice multiple of mss.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3000) * Window clamp already applied above.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3001) * If our current window offering is within 1 mss of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3002) * free space we just keep it. This prevents the divide
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3003) * and multiply from happening most of the time.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3004) * We also don't do any window rounding when the free space
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3005) * is too small.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3006) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3007) if (window <= free_space - mss || window > free_space)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3008) window = rounddown(free_space, mss);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3009) else if (mss == full_space &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3010) free_space > window + (full_space >> 1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3011) window = free_space;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3012) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3013)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3014) return window;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3015) }
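
/* Illustrative sketch (not used by the stack): the rounding step at the
 * end of __tcp_select_window() above, simplified (the mss == full_space
 * special case is omitted). With window scaling the offer is aligned up
 * to the scale granularity so it cannot be scaled away to zero; without
 * scaling it is kept a multiple of the MSS.
 */
static inline int tcp_doc_example_round_window(int free_space, int cur_window,
					       int mss, u8 rcv_wscale)
{
	if (rcv_wscale)
		return ALIGN(free_space, 1 << rcv_wscale);

	/* Keep the current offer if it is within one MSS of free_space,
	 * otherwise advertise the largest MSS multiple that fits.
	 */
	if (cur_window <= free_space - mss || cur_window > free_space)
		return rounddown(free_space, mss);

	return cur_window;
}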
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3016)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3017) void tcp_skb_collapse_tstamp(struct sk_buff *skb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3018) const struct sk_buff *next_skb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3019) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3020) if (unlikely(tcp_has_tx_tstamp(next_skb))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3021) const struct skb_shared_info *next_shinfo =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3022) skb_shinfo(next_skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3023) struct skb_shared_info *shinfo = skb_shinfo(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3024)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3025) shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3026) shinfo->tskey = next_shinfo->tskey;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3027) TCP_SKB_CB(skb)->txstamp_ack |=
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3028) TCP_SKB_CB(next_skb)->txstamp_ack;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3029) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3030) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3031)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3032) /* Collapses two adjacent SKB's during retransmission. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3033) static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3034) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3035) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3036) struct sk_buff *next_skb = skb_rb_next(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3037) int next_skb_size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3038)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3039) next_skb_size = next_skb->len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3040)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3041) BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3042)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3043) if (next_skb_size) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3044) if (next_skb_size <= skb_availroom(skb))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3045) skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3046) next_skb_size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3047) else if (!tcp_skb_shift(skb, next_skb, 1, next_skb_size))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3048) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3049) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3050) tcp_highest_sack_replace(sk, next_skb, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3051)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3052) /* Update sequence range on original skb. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3053) TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3054)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3055) /* Merge over control information. This moves PSH/FIN etc. over */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3056) TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3057)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3058) /* All done, get rid of second SKB and account for it so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3059) * packet counting does not break.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3060) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3061) TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3062) TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3063)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3064) /* changed transmit queue under us so clear hints */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3065) tcp_clear_retrans_hints_partial(tp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3066) if (next_skb == tp->retransmit_skb_hint)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3067) tp->retransmit_skb_hint = skb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3068)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3069) tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3070)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3071) tcp_skb_collapse_tstamp(skb, next_skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3072)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3073) tcp_rtx_queue_unlink_and_free(next_skb, sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3074) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3075) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3076)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3077) /* Check if coalescing SKBs is legal. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3078) static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3079) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3080) if (tcp_skb_pcount(skb) > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3081) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3082) if (skb_cloned(skb))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3083) return false;
	/* Some heuristics for collapsing over SACK'd data could be invented */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3085) if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3086) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3087)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3088) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3089) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3090)
/* Collapse packets in the retransmit queue to create fewer packets on the
 * wire. This is only done on retransmission.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3094) static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3095) int space)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3096) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3097) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3098) struct sk_buff *skb = to, *tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3099) bool first = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3100)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3101) if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3102) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3103) if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3104) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3105)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3106) skb_rbtree_walk_from_safe(skb, tmp) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3107) if (!tcp_can_collapse(sk, skb))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3108) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3109)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3110) if (!tcp_skb_can_collapse(to, skb))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3111) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3112)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3113) space -= skb->len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3114)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3115) if (first) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3116) first = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3117) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3118) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3119)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3120) if (space < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3121) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3122)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3123) if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3124) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3125)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3126) if (!tcp_collapse_retrans(sk, to))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3127) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3128) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3129) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3130)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3131) /* This retransmits one SKB. Policy decisions and retransmit queue
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3132) * state updates are done by the caller. Returns non-zero if an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3133) * error occurred which prevented the send.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3134) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3135) int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3136) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3137) struct inet_connection_sock *icsk = inet_csk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3138) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3139) unsigned int cur_mss;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3140) int diff, len, err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3141)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3143) /* Inconclusive MTU probe */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3144) if (icsk->icsk_mtup.probe_size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3145) icsk->icsk_mtup.probe_size = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3146)
	/* Do not send more than we queued. 1/4 is reserved for possible
	 * copying overhead: fragmentation, tunneling, mangling etc.
	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3150) if (refcount_read(&sk->sk_wmem_alloc) >
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3151) min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3152) sk->sk_sndbuf))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3153) return -EAGAIN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3154)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3155) if (skb_still_in_host_queue(sk, skb))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3156) return -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3157)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3158) if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3159) if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3160) WARN_ON_ONCE(1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3161) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3162) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3163) if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3164) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3165) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3166)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3167) if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3168) return -EHOSTUNREACH; /* Routing failure or similar. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3169)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3170) cur_mss = tcp_current_mss(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3171)
	/* If the receiver has shrunk its window, and this skb is beyond the
	 * new window, do not retransmit it. The exception is when the window
	 * has shrunk to zero, in which case our retransmit serves as a zero
	 * window probe.
	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3177) if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3178) TCP_SKB_CB(skb)->seq != tp->snd_una)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3179) return -EAGAIN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3180)
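	/* Keep the retransmit within 'segs' segments at the current MSS:
	 * split an oversized skb, otherwise refresh its TSO accounting and,
	 * if it is now smaller than one MSS, try to collapse it with the
	 * skbs that follow it.
	 */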
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3181) len = cur_mss * segs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3182) if (skb->len > len) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3183) if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3184) cur_mss, GFP_ATOMIC))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3185) return -ENOMEM; /* We'll try again later. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3186) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3187) if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3188) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3189)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3190) diff = tcp_skb_pcount(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3191) tcp_set_skb_tso_segs(skb, cur_mss);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3192) diff -= tcp_skb_pcount(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3193) if (diff)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3194) tcp_adjust_pcount(sk, skb, diff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3195) if (skb->len < cur_mss)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3196) tcp_retrans_try_collapse(sk, skb, cur_mss);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3197) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3198)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3199) /* RFC3168, section 6.1.1.1. ECN fallback */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3200) if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3201) tcp_ecn_clear_syn(sk, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3202)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3203) /* Update global and local TCP statistics. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3204) segs = tcp_skb_pcount(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3205) TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3206) if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3207) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3208) tp->total_retrans += segs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3209) tp->bytes_retrans += skb->len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3210)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3211) /* make sure skb->data is aligned on arches that require it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3212) * and check if ack-trimming & collapsing extended the headroom
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3213) * beyond what csum_start can cover.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3214) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3215) if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3216) skb_headroom(skb) >= 0xFFFF)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3217) struct sk_buff *nskb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3218)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3219) tcp_skb_tsorted_save(skb) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3220) nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3221) if (nskb) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3222) nskb->dev = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3223) err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3224) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3225) err = -ENOBUFS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3226) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3227) } tcp_skb_tsorted_restore(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3228)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3229) if (!err) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3230) tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3231) tcp_rate_skb_sent(sk, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3232) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3233) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3234) err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3235) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3236)
	/* To avoid taking spuriously low RTT samples based on a timestamp
	 * for a transmit that never happened, always mark EVER_RETRANS.
	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3240) TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3241)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3242) if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3243) tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3244) TCP_SKB_CB(skb)->seq, segs, err);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3245)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3246) if (likely(!err)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3247) trace_tcp_retransmit_skb(sk, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3248) } else if (err != -EBUSY) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3249) NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3250) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3251) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3252) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3253)
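/* Retransmit one skb via __tcp_retransmit_skb() and update the associated
 * bookkeeping: TCPCB_RETRANS and retrans_out on success, plus the timestamp
 * of the first (attempted) retransmit and the undo_retrans counter.
 */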
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3254) int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3255) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3256) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3257) int err = __tcp_retransmit_skb(sk, skb, segs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3258)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3259) if (err == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3260) #if FASTRETRANS_DEBUG > 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3261) if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3262) net_dbg_ratelimited("retrans_out leaked\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3263) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3264) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3265) TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3266) tp->retrans_out += tcp_skb_pcount(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3267) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3268)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3269) /* Save stamp of the first (attempted) retransmit. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3270) if (!tp->retrans_stamp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3271) tp->retrans_stamp = tcp_skb_timestamp(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3272)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3273) if (tp->undo_retrans < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3274) tp->undo_retrans = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3275) tp->undo_retrans += tcp_skb_pcount(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3276) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3277) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3278)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3279) /* This gets called after a retransmit timeout, and the initially
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3280) * retransmitted data is acknowledged. It tries to continue
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3281) * resending the rest of the retransmit queue, until either
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3282) * we've sent it all or the congestion window limit is reached.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3283) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3284) void tcp_xmit_retransmit_queue(struct sock *sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3285) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3286) const struct inet_connection_sock *icsk = inet_csk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3287) struct sk_buff *skb, *rtx_head, *hole = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3288) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3289) bool rearm_timer = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3290) u32 max_segs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3291) int mib_idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3292)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3293) if (!tp->packets_out)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3294) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3295)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3296) rtx_head = tcp_rtx_queue_head(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3297) skb = tp->retransmit_skb_hint ?: rtx_head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3298) max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
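	/* Walk the retransmit queue from the saved hint, resending segments
	 * marked lost, as long as the congestion window, pacing and
	 * small-queue limits allow.
	 */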
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3299) skb_rbtree_walk_from(skb) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3300) __u8 sacked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3301) int segs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3302)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3303) if (tcp_pacing_check(sk))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3304) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3305)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3306) /* we could do better than to assign each time */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3307) if (!hole)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3308) tp->retransmit_skb_hint = skb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3309)
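		/* How many more segments the congestion window allows in flight. */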
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3310) segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3311) if (segs <= 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3312) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3313) sacked = TCP_SKB_CB(skb)->sacked;
		/* In case tcp_shift_skb_data() has aggregated large skbs,
		 * we need to make sure we do not send overly big TSO packets.
		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3317) segs = min_t(int, segs, max_segs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3318)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3319) if (tp->retrans_out >= tp->lost_out) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3320) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3321) } else if (!(sacked & TCPCB_LOST)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3322) if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3323) hole = skb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3324) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3325)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3326) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3327) if (icsk->icsk_ca_state != TCP_CA_Loss)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3328) mib_idx = LINUX_MIB_TCPFASTRETRANS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3329) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3330) mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3331) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3332)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3333) if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3334) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3335)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3336) if (tcp_small_queue_check(sk, skb, 1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3337) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3338)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3339) if (tcp_retransmit_skb(sk, skb, segs))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3340) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3341)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3342) NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3343)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3344) if (tcp_in_cwnd_reduction(sk))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3345) tp->prr_out += tcp_skb_pcount(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3346)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3347) if (skb == rtx_head &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3348) icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3349) rearm_timer = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3350)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3351) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3352) if (rearm_timer)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3353) tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3354) inet_csk(sk)->icsk_rto,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3355) TCP_RTO_MAX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3356) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3357)
/* We allow exceeding the memory limits for FIN packets, to expedite
 * connection tear down and (memory) recovery.
 * Otherwise tcp_send_fin() could be tempted to either delay the FIN
 * or even be forced to close the flow without any FIN.
 * In general, we want to allow one skb per socket to avoid hangs
 * with edge-triggered epoll().
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3365) void sk_forced_mem_schedule(struct sock *sk, int size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3366) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3367) int amt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3368)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3369) if (size <= sk->sk_forward_alloc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3370) return;
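	/* Force the charge, in whole SK_MEM_QUANTUM units, without the usual
	 * memory limit checks.
	 */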
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3371) amt = sk_mem_pages(size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3372) sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3373) sk_memory_allocated_add(sk, amt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3374)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3375) if (mem_cgroup_sockets_enabled && sk->sk_memcg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3376) mem_cgroup_charge_skmem(sk->sk_memcg, amt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3377) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3378)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3379) /* Send a FIN. The caller locks the socket for us.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3380) * We should try to send a FIN packet really hard, but eventually give up.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3381) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3382) void tcp_send_fin(struct sock *sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3383) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3384) struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3385) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3386)
	/* Optimization: tack the FIN onto the tail skb of the write queue if
	 * there is one (it has not been sent yet), or, under memory pressure,
	 * onto the last skb of the rtx queue.
	 * Note: in the latter case, the FIN packet will be sent after a
	 * timeout, as the TCP stack thinks it has already been transmitted.
	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3392) tskb = tail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3393) if (!tskb && tcp_under_memory_pressure(sk))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3394) tskb = skb_rb_last(&sk->tcp_rtx_queue);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3395)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3396) if (tskb) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3397) TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3398) TCP_SKB_CB(tskb)->end_seq++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3399) tp->write_seq++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3400) if (!tail) {
			/* This means tskb was already sent.
			 * Pretend we included the FIN on the previous transmit.
			 * We need to set tp->snd_nxt to the value it would have
			 * if the FIN had been sent. This is because the
			 * retransmit path does not change tp->snd_nxt.
			 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3407) WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3408) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3409) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3410) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3411) skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3412) if (unlikely(!skb))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3413) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3414)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3415) INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3416) skb_reserve(skb, MAX_TCP_HEADER);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3417) sk_forced_mem_schedule(sk, skb->truesize);
		/* FIN eats a sequence byte; write_seq is advanced by tcp_queue_skb(). */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3419) tcp_init_nondata_skb(skb, tp->write_seq,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3420) TCPHDR_ACK | TCPHDR_FIN);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3421) tcp_queue_skb(sk, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3422) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3423) __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3424) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3425)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3426) /* We get here when a process closes a file descriptor (either due to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3427) * an explicit close() or as a byproduct of exit()'ing) and there
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3428) * was unread data in the receive queue. This behavior is recommended
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3429) * by RFC 2525, section 2.17. -DaveM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3430) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3431) void tcp_send_active_reset(struct sock *sk, gfp_t priority)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3432) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3433) struct sk_buff *skb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3434)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3435) TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3436)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3437) /* NOTE: No TCP options attached and we never retransmit this. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3438) skb = alloc_skb(MAX_TCP_HEADER, priority);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3439) if (!skb) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3440) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3441) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3442) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3443)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3444) /* Reserve space for headers and prepare control bits. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3445) skb_reserve(skb, MAX_TCP_HEADER);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3446) tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3447) TCPHDR_ACK | TCPHDR_RST);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3448) tcp_mstamp_refresh(tcp_sk(sk));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3449) /* Send it off. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3450) if (tcp_transmit_skb(sk, skb, 0, priority))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3451) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3452)
	/* The skb argument of trace_tcp_send_reset() is meant to carry the skb
	 * that caused the RST; the skb here is not that troublesome skb, so
	 * pass NULL.
	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3456) trace_tcp_send_reset(sk, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3457) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3458)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3459) /* Send a crossed SYN-ACK during socket establishment.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3460) * WARNING: This routine must only be called when we have already sent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3461) * a SYN packet that crossed the incoming SYN that caused this routine
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3462) * to get called. If this assumption fails then the initial rcv_wnd
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3463) * and rcv_wscale values will not be correct.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3464) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3465) int tcp_send_synack(struct sock *sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3466) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3467) struct sk_buff *skb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3468)
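	/* Our own SYN should still be sitting at the head of the rtx queue. */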
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3469) skb = tcp_rtx_queue_head(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3470) if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3471) pr_err("%s: wrong queue state\n", __func__);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3472) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3473) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3474) if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3475) if (skb_cloned(skb)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3476) struct sk_buff *nskb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3477)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3478) tcp_skb_tsorted_save(skb) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3479) nskb = skb_copy(skb, GFP_ATOMIC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3480) } tcp_skb_tsorted_restore(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3481) if (!nskb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3482) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3483) INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3484) tcp_highest_sack_replace(sk, skb, nskb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3485) tcp_rtx_queue_unlink_and_free(skb, sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3486) __skb_header_release(nskb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3487) tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3488) sk_wmem_queued_add(sk, nskb->truesize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3489) sk_mem_charge(sk, nskb->truesize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3490) skb = nskb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3491) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3492)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3493) TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3494) tcp_ecn_send_synack(sk, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3495) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3496) return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3497) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3498)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3499) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3500) * tcp_make_synack - Allocate one skb and build a SYNACK packet.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3501) * @sk: listener socket
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3502) * @dst: dst entry attached to the SYNACK. It is consumed and caller
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3503) * should not use it again.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3504) * @req: request_sock pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3505) * @foc: cookie for tcp fast open
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3506) * @synack_type: Type of synack to prepare
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3507) * @syn_skb: SYN packet just received. It could be NULL for rtx case.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3508) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3509) struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3510) struct request_sock *req,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3511) struct tcp_fastopen_cookie *foc,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3512) enum tcp_synack_type synack_type,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3513) struct sk_buff *syn_skb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3514) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3515) struct inet_request_sock *ireq = inet_rsk(req);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3516) const struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3517) struct tcp_md5sig_key *md5 = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3518) struct tcp_out_options opts;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3519) struct sk_buff *skb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3520) int tcp_header_size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3521) struct tcphdr *th;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3522) int mss;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3523) u64 now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3524)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3525) skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3526) if (unlikely(!skb)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3527) dst_release(dst);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3528) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3529) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3530) /* Reserve space for headers. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3531) skb_reserve(skb, MAX_TCP_HEADER);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3532)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3533) switch (synack_type) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3534) case TCP_SYNACK_NORMAL:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3535) skb_set_owner_w(skb, req_to_sk(req));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3536) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3537) case TCP_SYNACK_COOKIE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3538) /* Under synflood, we do not attach skb to a socket,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3539) * to avoid false sharing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3540) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3541) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3542) case TCP_SYNACK_FASTOPEN:
		/* sk is a const pointer, because we want to express that
		 * multiple cpus might call us concurrently.
		 * sk->sk_wmem_alloc is an atomic, so we can safely promote
		 * the pointer to read-write here.
		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3547) skb_set_owner_w(skb, (struct sock *)sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3548) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3549) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3550) skb_dst_set(skb, dst);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3551)
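	/* Advertised MSS for the SYNACK: route advmss, clamped by any
	 * user-supplied MSS.
	 */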
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3552) mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3553)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3554) memset(&opts, 0, sizeof(opts));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3555) now = tcp_clock_ns();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3556) #ifdef CONFIG_SYN_COOKIES
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3557) if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3558) skb->skb_mstamp_ns = cookie_init_timestamp(req, now);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3559) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3560) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3561) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3562) skb->skb_mstamp_ns = now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3563) if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3564) tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3565) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3566)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3567) #ifdef CONFIG_TCP_MD5SIG
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3568) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3569) md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3570) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3571) skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3572) /* bpf program will be interested in the tcp_flags */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3573) TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3574) tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3575) foc, synack_type,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3576) syn_skb) + sizeof(*th);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3577)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3578) skb_push(skb, tcp_header_size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3579) skb_reset_transport_header(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3580)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3581) th = (struct tcphdr *)skb->data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3582) memset(th, 0, sizeof(struct tcphdr));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3583) th->syn = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3584) th->ack = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3585) tcp_ecn_make_synack(req, th);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3586) th->source = htons(ireq->ir_num);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3587) th->dest = ireq->ir_rmt_port;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3588) skb->mark = ireq->ir_mark;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3589) skb->ip_summed = CHECKSUM_PARTIAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3590) th->seq = htonl(tcp_rsk(req)->snt_isn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3591) /* XXX data is queued and acked as is. No buffer/window check */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3592) th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3593)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3594) /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3595) th->window = htons(min(req->rsk_rcv_wnd, 65535U));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3596) tcp_options_write((__be32 *)(th + 1), NULL, &opts);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3597) th->doff = (tcp_header_size >> 2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3598) __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3599)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3600) #ifdef CONFIG_TCP_MD5SIG
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3601) /* Okay, we have all we need - do the md5 hash if needed */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3602) if (md5)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3603) tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3604) md5, req_to_sk(req), skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3605) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3606) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3607)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3608) bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3609) synack_type, &opts);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3610)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3611) skb->skb_mstamp_ns = now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3612) tcp_add_tx_delay(skb, tp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3613)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3614) return skb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3615) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3616) EXPORT_SYMBOL(tcp_make_synack);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3617)
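/* If the destination route pins a congestion control algorithm
 * (RTAX_CC_ALGO metric), switch the socket to it, provided the module
 * reference can be taken.
 */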
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3618) static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3619) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3620) struct inet_connection_sock *icsk = inet_csk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3621) const struct tcp_congestion_ops *ca;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3622) u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3623)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3624) if (ca_key == TCP_CA_UNSPEC)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3625) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3626)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3627) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3628) ca = tcp_ca_find_key(ca_key);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3629) if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3630) bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3631) icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3632) icsk->icsk_ca_ops = ca;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3633) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3634) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3635) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3636)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3637) /* Do all connect socket setups that can be done AF independent. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3638) static void tcp_connect_init(struct sock *sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3639) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3640) const struct dst_entry *dst = __sk_dst_get(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3641) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3642) __u8 rcv_wscale;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3643) u32 rcv_wnd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3644)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3645) /* We'll fix this up when we get a response from the other end.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3646) * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3647) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3648) tp->tcp_header_len = sizeof(struct tcphdr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3649) if (sock_net(sk)->ipv4.sysctl_tcp_timestamps)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3650) tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3651)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3652) #ifdef CONFIG_TCP_MD5SIG
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3653) if (tp->af_specific->md5_lookup(sk, sk))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3654) tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3655) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3656)
	/* If the user set TCP_MAXSEG, record it as the MSS clamp */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3658) if (tp->rx_opt.user_mss)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3659) tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3660) tp->max_window = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3661) tcp_mtup_init(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3662) tcp_sync_mss(sk, dst_mtu(dst));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3663)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3664) tcp_ca_dst_init(sk, dst);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3665)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3666) if (!tp->window_clamp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3667) tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3668) tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3669)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3670) tcp_initialize_rcv_mss(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3671)
	/* limit the window selection if the user enforces a smaller rx buffer */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3673) if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3674) (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3675) tp->window_clamp = tcp_full_space(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3676)
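	/* Initial receive window: BPF-provided value if any, otherwise the
	 * route's RTAX_INITRWND metric.
	 */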
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3677) rcv_wnd = tcp_rwnd_init_bpf(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3678) if (rcv_wnd == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3679) rcv_wnd = dst_metric(dst, RTAX_INITRWND);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3680)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3681) tcp_select_initial_window(sk, tcp_full_space(sk),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3682) tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3683) &tp->rcv_wnd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3684) &tp->window_clamp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3685) sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3686) &rcv_wscale,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3687) rcv_wnd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3688)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3689) tp->rx_opt.rcv_wscale = rcv_wscale;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3690) tp->rcv_ssthresh = tp->rcv_wnd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3691)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3692) sk->sk_err = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3693) sock_reset_flag(sk, SOCK_DONE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3694) tp->snd_wnd = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3695) tcp_init_wl(tp, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3696) tcp_write_queue_purge(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3697) tp->snd_una = tp->write_seq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3698) tp->snd_sml = tp->write_seq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3699) tp->snd_up = tp->write_seq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3700) WRITE_ONCE(tp->snd_nxt, tp->write_seq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3701)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3702) if (likely(!tp->repair))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3703) tp->rcv_nxt = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3704) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3705) tp->rcv_tstamp = tcp_jiffies32;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3706) tp->rcv_wup = tp->rcv_nxt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3707) WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3708)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3709) inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3710) inet_csk(sk)->icsk_retransmits = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3711) tcp_clear_retrans(tp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3712) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3713)
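/* Charge the SYN (or SYN+data) skb to the socket and account it as queued:
 * advance write_seq and packets_out accordingly.
 */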
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3714) static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3715) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3716) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3717) struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3718)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3719) tcb->end_seq += skb->len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3720) __skb_header_release(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3721) sk_wmem_queued_add(sk, skb->truesize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3722) sk_mem_charge(sk, skb->truesize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3723) WRITE_ONCE(tp->write_seq, tcb->end_seq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3724) tp->packets_out += tcp_skb_pcount(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3725) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3726)
/* Build and send a SYN with data and a (cached) Fast Open cookie. However,
 * queue a data-only packet after the regular SYN, so that regular SYNs
 * are retransmitted on timeouts. Also, if the remote SYN-ACK acknowledges
 * only the SYN sequence, the data is retransmitted in the first ACK.
 * If the cookie is not cached or another error occurs, fall back to sending
 * a regular SYN with the Fast Open cookie request option.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3734) static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3735) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3736) struct inet_connection_sock *icsk = inet_csk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3737) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3738) struct tcp_fastopen_request *fo = tp->fastopen_req;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3739) int space, err = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3740) struct sk_buff *syn_data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3741)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3742) tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3743) if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3744) goto fallback;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3745)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3746) /* MSS for SYN-data is based on cached MSS and bounded by PMTU and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3747) * user-MSS. Reserve maximum option space for middleboxes that add
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3748) * private TCP options. The cost is reduced data space in SYN :(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3749) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3750) tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3751) /* Sync mss_cache after updating the mss_clamp */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3752) tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3753)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3754) space = __tcp_mtu_to_mss(sk, icsk->icsk_pmtu_cookie) -
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3755) MAX_TCP_OPTION_SPACE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3756)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3757) space = min_t(size_t, space, fo->size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3758)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3759) /* limit to order-0 allocations */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3760) space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3761)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3762) syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3763) if (!syn_data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3764) goto fallback;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3765) syn_data->ip_summed = CHECKSUM_PARTIAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3766) memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3767) if (space) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3768) int copied = copy_from_iter(skb_put(syn_data, space), space,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3769) &fo->data->msg_iter);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3770) if (unlikely(!copied)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3771) tcp_skb_tsorted_anchor_cleanup(syn_data);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3772) kfree_skb(syn_data);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3773) goto fallback;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3774) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3775) if (copied != space) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3776) skb_trim(syn_data, copied);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3777) space = copied;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3778) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3779) skb_zcopy_set(syn_data, fo->uarg, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3780) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3781) /* No more data pending in inet_wait_for_connect() */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3782) if (space == fo->size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3783) fo->data = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3784) fo->copied = space;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3785)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3786) tcp_connect_queue_skb(sk, syn_data);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3787) if (syn_data->len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3788) tcp_chrono_start(sk, TCP_CHRONO_BUSY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3789)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3790) err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3791)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3792) syn->skb_mstamp_ns = syn_data->skb_mstamp_ns;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3793)
	/* Now that the full SYN+DATA has been cloned and sent (or not),
	 * remove the SYN from the original skb (syn_data)
	 * that we keep in the write queue in case of a retransmit, as we
	 * also have the SYN packet (with no data) in the same queue.
	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3799) TCP_SKB_CB(syn_data)->seq++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3800) TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3801) if (!err) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3802) tp->syn_data = (fo->copied > 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3803) tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3804) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3805) goto done;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3806) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3807)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3808) /* data was not sent, put it in write_queue */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3809) __skb_queue_tail(&sk->sk_write_queue, syn_data);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3810) tp->packets_out -= tcp_skb_pcount(syn_data);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3811)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3812) fallback:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3813) /* Send a regular SYN with Fast Open cookie request option */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3814) if (fo->cookie.len > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3815) fo->cookie.len = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3816) err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3817) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3818) tp->syn_fastopen = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3819) done:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3820) fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3821) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3822) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3823)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3824) /* Build a SYN and send it off. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3825) int tcp_connect(struct sock *sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3826) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3827) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3828) struct sk_buff *buff;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3829) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3830)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3831) tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3832)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3833) if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3834) return -EHOSTUNREACH; /* Routing failure or similar. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3835)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3836) tcp_connect_init(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3837)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3838) if (unlikely(tp->repair)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3839) tcp_finish_connect(sk, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3840) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3841) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3842)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3843) buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3844) if (unlikely(!buff))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3845) return -ENOBUFS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3846)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3847) tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3848) tcp_mstamp_refresh(tp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3849) tp->retrans_stamp = tcp_time_stamp(tp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3850) tcp_connect_queue_skb(sk, buff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3851) tcp_ecn_send_syn(sk, buff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3852) tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3853)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3854) /* Send off SYN; include data in Fast Open. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3855) err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3856) tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3857) if (err == -ECONNREFUSED)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3858) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3859)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3860) /* We change tp->snd_nxt after the tcp_transmit_skb() call
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3861) * so that this packet gets counted in tcpOutSegs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3862) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3863) WRITE_ONCE(tp->snd_nxt, tp->write_seq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3864) tp->pushed_seq = tp->write_seq;
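/* Assumption behind this note: tcp_send_head() is only non-NULL here when
 * tcp_send_syn_data() fell back and re-queued the unsent Fast Open data, so
 * snd_nxt/pushed_seq must not be advanced past that still-unsent skb.
 */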
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3865) buff = tcp_send_head(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3866) if (unlikely(buff)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3867) WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3868) tp->pushed_seq = TCP_SKB_CB(buff)->seq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3869) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3870) TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3871)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3872) /* Timer for repeating the SYN until an answer. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3873) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3874) inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3875) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3876) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3877) EXPORT_SYMBOL(tcp_connect);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3878)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3879) /* Send out a delayed ack; the caller does the policy checking
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3880) * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3881) * for details.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3882) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3883) void tcp_send_delayed_ack(struct sock *sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3884) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3885) struct inet_connection_sock *icsk = inet_csk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3886) int ato = icsk->icsk_ack.ato;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3887) unsigned long timeout;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3888)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3889) if (ato > TCP_DELACK_MIN) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3890) const struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3891) int max_ato = HZ / 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3892)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3893) if (inet_csk_in_pingpong_mode(sk) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3894) (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3895) max_ato = TCP_DELACK_MAX;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3896)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3897) /* Slow path, intersegment interval is "high". */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3898)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3899) /* If an RTT estimate is known, use it to bound the delayed ack.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3900) * Do not use inet_csk(sk)->icsk_rto here; use the results of RTT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3901) * measurements directly.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3902) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3903) if (tp->srtt_us) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3904) int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3905) TCP_DELACK_MIN);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3906)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3907) if (rtt < max_ato)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3908) max_ato = rtt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3909) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3910)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3911) ato = min(ato, max_ato);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3912) }
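/* Worked example (illustrative, assuming HZ=1000 so TCP_DELACK_MIN is 40
 * jiffies): with tp->srtt_us == 800000 the smoothed RTT is 100 ms (srtt_us
 * is stored left-shifted by 3), giving rtt = max(100, 40) = 100 jiffies.
 * That is below the HZ/2 default, so max_ato drops to 100 and ato is
 * clamped to at most 100 ms before the clamps below.
 */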
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3913)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3914) ato = min_t(u32, ato, inet_csk(sk)->icsk_delack_max);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3915)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3916) /* Stay within the limit we were given */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3917) timeout = jiffies + ato;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3918)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3919) /* Use the new timeout only if there wasn't an older one already pending. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3920) if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3921) /* If delack timer is about to expire, send ACK now. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3922) if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3923) tcp_send_ack(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3924) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3925) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3926)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3927) if (!time_before(timeout, icsk->icsk_ack.timeout))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3928) timeout = icsk->icsk_ack.timeout;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3929) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3930) icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3931) icsk->icsk_ack.timeout = timeout;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3932) sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3933) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3934)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3935) /* This routine sends an ack and also updates the window. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3936) void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3937) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3938) struct sk_buff *buff;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3939)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3940) /* If we have been reset, we may not send again. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3941) if (sk->sk_state == TCP_CLOSE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3942) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3943)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3944) /* We are not putting this on the write queue, so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3945) * tcp_transmit_skb() will set the ownership to this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3946) * sock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3947) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3948) buff = alloc_skb(MAX_TCP_HEADER,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3949) sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3950) if (unlikely(!buff)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3951) struct inet_connection_sock *icsk = inet_csk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3952) unsigned long delay;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3953)
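/* Illustrative numbers (assuming HZ=1000, so TCP_DELACK_MAX is 200 ms):
 * repeated allocation failures retry the ACK after 200 ms, 400 ms,
 * 800 ms, ... until the shift reaches TCP_RTO_MAX, which also caps the
 * timer armed below.
 */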
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3954) delay = TCP_DELACK_MAX << icsk->icsk_ack.retry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3955) if (delay < TCP_RTO_MAX)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3956) icsk->icsk_ack.retry++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3957) inet_csk_schedule_ack(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3958) icsk->icsk_ack.ato = TCP_ATO_MIN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3959) inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, TCP_RTO_MAX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3960) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3961) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3962)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3963) /* Reserve space for headers and prepare control bits. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3964) skb_reserve(buff, MAX_TCP_HEADER);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3965) tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3966)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3967) /* We do not want pure acks influencing TCP Small Queues or fq/pacing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3968) * too much.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3969) * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3970) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3971) skb_set_tcp_pure_ack(buff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3972)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3973) /* Send it off, this clears delayed acks for us. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3974) __tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3975) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3976) EXPORT_SYMBOL_GPL(__tcp_send_ack);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3977)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3978) void tcp_send_ack(struct sock *sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3979) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3980) __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3981) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3982)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3983) /* This routine sends a packet with an out-of-date sequence
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3984) * number. It assumes the other end will try to ack it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3985) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3986) * Question: what should we do in urgent mode?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3987) * 4.4BSD forces sending a single byte of data. We cannot send
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3988) * out-of-window data, because we have SND.NXT==SND.MAX...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3989) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3990) * Current solution: send TWO zero-length segments in urgent mode:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3991) * one with SEG.SEQ=SND.UNA to deliver the urgent pointer, another
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3992) * out-of-date with SND.UNA-1 to probe the window.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3993) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3994) static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3995) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3996) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3997) struct sk_buff *skb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3998)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3999) /* We don't queue it; tcp_transmit_skb() sets ownership. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4000) skb = alloc_skb(MAX_TCP_HEADER,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4001) sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4002) if (!skb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4003) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4004)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4005) /* Reserve space for headers and set control bits. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4006) skb_reserve(skb, MAX_TCP_HEADER);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4007) /* Use a previous sequence. This should cause the other
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4008) * end to send an ack. Don't queue or clone SKB, just
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4009) * send it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4010) */
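/* A plain window probe (urgent == 0) uses SND.UNA - 1, a sequence that is
 * already acknowledged and therefore deliberately out of window; the
 * urgent-mode probe (urgent == 1) uses SND.UNA itself so the urgent
 * pointer can be delivered.
 */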
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4011) tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4012) NET_INC_STATS(sock_net(sk), mib);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4013) return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4014) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4015)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4016) /* Called from setsockopt( ... TCP_REPAIR ) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4017) void tcp_send_window_probe(struct sock *sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4018) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4019) if (sk->sk_state == TCP_ESTABLISHED) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4020) tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4021) tcp_mstamp_refresh(tcp_sk(sk));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4022) tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4023) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4024) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4025)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4026) /* Initiate keepalive or window probe from timer. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4027) int tcp_write_wakeup(struct sock *sk, int mib)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4028) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4029) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4030) struct sk_buff *skb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4031)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4032) if (sk->sk_state == TCP_CLOSE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4033) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4034)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4035) skb = tcp_send_head(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4036) if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4037) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4038) unsigned int mss = tcp_current_mss(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4039) unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4040)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4041) if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4042) tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4043)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4044) /* We are probing the opening of a window,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4045) * but the window size is != 0; this must
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4046) * have been a result of sender-side SWS avoidance.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4047) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4048) if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4049) skb->len > mss) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4050) seg_size = min(seg_size, mss);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4051) TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4052) if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4053) skb, seg_size, mss, GFP_ATOMIC))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4054) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4055) } else if (!tcp_skb_pcount(skb))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4056) tcp_set_skb_tso_segs(skb, mss);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4057)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4058) TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4059) err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4060) if (!err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4061) tcp_event_new_data_sent(sk, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4062) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4063) } else {
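/* Urgent mode: if the urgent pointer is ahead of SND.UNA and still within
 * reach of the 16-bit urgent offset, also send the SND.UNA-based probe
 * that carries the urgent pointer (see the comment above
 * tcp_xmit_probe_skb()).
 */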
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4064) if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4065) tcp_xmit_probe_skb(sk, 1, mib);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4066) return tcp_xmit_probe_skb(sk, 0, mib);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4067) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4068) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4069)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4070) /* A window probe timeout has occurred. If the window is not closed, send
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4071) * a partial packet; otherwise send a zero-window probe.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4072) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4073) void tcp_send_probe0(struct sock *sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4074) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4075) struct inet_connection_sock *icsk = inet_csk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4076) struct tcp_sock *tp = tcp_sk(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4077) struct net *net = sock_net(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4078) unsigned long timeout;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4079) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4080)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4081) err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4082)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4083) if (tp->packets_out || tcp_write_queue_empty(sk)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4084) /* Cancel probe timer, if it is not required. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4085) icsk->icsk_probes_out = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4086) icsk->icsk_backoff = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4087) icsk->icsk_probes_tstamp = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4088) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4089) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4090)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4091) icsk->icsk_probes_out++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4092) if (err <= 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4093) if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4094) icsk->icsk_backoff++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4095) timeout = tcp_probe0_when(sk, TCP_RTO_MAX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4096) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4097) /* If the packet was not sent due to local congestion,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4098) * let senders fight for local resources conservatively.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4099) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4100) timeout = TCP_RESOURCE_PROBE_INTERVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4101) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4102)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4103) timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4104) tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4105) }
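/* Note on the resulting schedule (a sketch based on the tcp_probe0_when()
 * helper in net/tcp.h, not on code in this file): the timeout grows roughly
 * as the RTO-derived base left-shifted by icsk_backoff, capped at
 * TCP_RTO_MAX, so unanswered probes are spaced exponentially further apart
 * until tcp_clamp_probe0_to_user_timeout() trims the delay to any
 * TCP_USER_TIMEOUT the application has set.
 */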
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4106)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4107) int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4108) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4109) const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4110) struct flowi fl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4111) int res;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4112)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4113) tcp_rsk(req)->txhash = net_tx_rndhash();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4114) res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4115) NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4116) if (!res) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4117) __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4118) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4119) if (unlikely(tcp_passive_fastopen(sk)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4120) tcp_sk(sk)->total_retrans++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4121) trace_tcp_retransmit_synack(sk, req);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4122) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4123) return res;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4124) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4125) EXPORT_SYMBOL(tcp_rtx_synack);