^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) // SPDX-License-Identifier: GPL-2.0-or-later
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) * Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) * Meant to be mostly used for locally generated traffic:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) * Fast classification depends on skb->sk being set before reaching us.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) * If not (router workload), we use the rxhash as a fallback, with a 32-bit wide hash.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) * All packets belonging to a socket are considered a 'flow'.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12) * Flows are dynamically allocated and stored in a hash table of RB trees.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13) * They are also part of one of two Round Robin 'queues' (new or old flows).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) * Burst avoidance (aka pacing) capability:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17) * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18) * bunch of packets, and this packet scheduler adds delay between
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19) * packets to respect rate limitation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 21) * enqueue() :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 22) * - look up one RB tree (out of 1024 or more) to find the flow.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23) * If the flow does not exist, create it and add it to the tree.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24) * Add the skb to the flow's list of skbs (FIFO).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) * - Use a special FIFO for high-prio packets
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27) * dequeue() : serves flows in Round Robin
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28) * Note: When a flow becomes empty, we do not immediately remove it from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29) * RB trees, for performance reasons (it's expected to send additional packets,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 30) * or the SLAB cache will reuse the socket for another flow).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 31) */
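/*
 * Illustrative sketch (not part of this file): an application can cap the
 * pacing rate this qdisc applies to its socket with SO_MAX_PACING_RATE, e.g.
 *
 *	unsigned int rate = 1000000;		// bytes per second
 *	setsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE, &rate, sizeof(rate));
 *
 * TCP otherwise sets sk->sk_pacing_rate itself once pacing is in use.
 */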
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 32)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 33) #include <linux/module.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 34) #include <linux/types.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 35) #include <linux/kernel.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 36) #include <linux/jiffies.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 37) #include <linux/string.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 38) #include <linux/in.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 39) #include <linux/errno.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 40) #include <linux/init.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 41) #include <linux/skbuff.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42) #include <linux/slab.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43) #include <linux/rbtree.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 44) #include <linux/hash.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 45) #include <linux/prefetch.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 46) #include <linux/vmalloc.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 47) #include <net/netlink.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48) #include <net/pkt_sched.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49) #include <net/sock.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50) #include <net/tcp_states.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51) #include <net/tcp.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53) struct fq_skb_cb {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54) u64 time_to_send;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57) static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 58) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 59) qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 60) return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 61) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64) * Per flow structure, dynamically allocated.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65) * If packets have monotonically increasing time_to_send, they are placed in O(1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66) * in a linear list (head, tail), otherwise they are placed in an rbtree (t_root).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68) struct fq_flow {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 69) /* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 70) struct rb_root t_root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 71) struct sk_buff *head; /* list of skbs for this flow : first skb */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 72) union {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 73) struct sk_buff *tail; /* last skb in the list */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 74) unsigned long age; /* (jiffies | 1UL) when flow was emptied, for gc */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 75) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76) struct rb_node fq_node; /* anchor in fq_root[] trees */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 77) struct sock *sk;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 78) u32 socket_hash; /* sk_hash */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 79) int qlen; /* number of packets in flow queue */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 80)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 81) /* Second cache line, used in fq_dequeue() */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 82) int credit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 83) /* 32bit hole on 64bit arches */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 84)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 85) struct fq_flow *next; /* next pointer in RR lists */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 86)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 87) struct rb_node rate_node; /* anchor in q->delayed tree */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 88) u64 time_next_packet;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 89) } ____cacheline_aligned_in_smp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 90)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 91) struct fq_flow_head {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 92) struct fq_flow *first;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 93) struct fq_flow *last;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 94) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 95)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 96) struct fq_sched_data {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 97) struct fq_flow_head new_flows;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 98)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 99) struct fq_flow_head old_flows;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) struct rb_root delayed; /* for rate limited flows */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) u64 time_next_delayed_flow;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) u64 ktime_cache; /* copy of last ktime_get_ns() */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) unsigned long unthrottle_latency_ns;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) struct fq_flow internal; /* for unclassified or high-prio packets */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107) u32 quantum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) u32 initial_quantum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109) u32 flow_refill_delay;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) u32 flow_plimit; /* max packets per flow */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) unsigned long flow_max_rate; /* optional max rate per flow */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) u64 ce_threshold;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) u64 horizon; /* horizon in ns */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114) u32 orphan_mask; /* mask for orphaned skb */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115) u32 low_rate_threshold;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116) struct rb_root *fq_root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) u8 rate_enable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118) u8 fq_trees_log;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) u8 horizon_drop;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) u32 flows;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121) u32 inactive_flows;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) u32 throttled_flows;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124) u64 stat_gc_flows;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) u64 stat_internal_packets;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126) u64 stat_throttled;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127) u64 stat_ce_mark;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128) u64 stat_horizon_drops;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) u64 stat_horizon_caps;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130) u64 stat_flows_plimit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) u64 stat_pkts_too_long;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) u64 stat_allocation_errors;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) u32 timer_slack; /* hrtimer slack in ns */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135) struct qdisc_watchdog watchdog;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) * f->tail and f->age share the same location.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) * We can use the low order bit to differentiate whether this location points
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) * to a sk_buff or contains a jiffies value, by forcing this value to be odd.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) * This assumes the low order bit of f->tail is 0, since alignof(struct sk_buff) >= 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) static void fq_flow_set_detached(struct fq_flow *f)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) f->age = jiffies | 1UL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) static bool fq_flow_is_detached(const struct fq_flow *f)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) return !!(f->age & 1UL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) /* special value to mark a throttled flow (not on old/new list) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) static struct fq_flow throttled;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) static bool fq_flow_is_throttled(const struct fq_flow *f)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159) return f->next == &throttled;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) if (head->first)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165) head->last->next = flow;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) head->first = flow;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168) head->last = flow;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169) flow->next = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174) rb_erase(&f->rate_node, &q->delayed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175) q->throttled_flows--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176) fq_flow_add_tail(&q->old_flows, f);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178)
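/* Park @f in the 'delayed' rbtree, keyed by time_next_packet, and pull
 * q->time_next_delayed_flow earlier if this flow is due before it.
 */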
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179) static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181) struct rb_node **p = &q->delayed.rb_node, *parent = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183) while (*p) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184) struct fq_flow *aux;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186) parent = *p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187) aux = rb_entry(parent, struct fq_flow, rate_node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188) if (f->time_next_packet >= aux->time_next_packet)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189) p = &parent->rb_right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191) p = &parent->rb_left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193) rb_link_node(&f->rate_node, parent, p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194) rb_insert_color(&f->rate_node, &q->delayed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195) q->throttled_flows++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196) q->stat_throttled++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198) f->next = &throttled;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199) if (q->time_next_delayed_flow > f->time_next_packet)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200) q->time_next_delayed_flow = f->time_next_packet;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204) static struct kmem_cache *fq_flow_cachep __read_mostly;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207) /* limit number of collected flows per round */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208) #define FQ_GC_MAX 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209) #define FQ_GC_AGE (3*HZ)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211) static bool fq_gc_candidate(const struct fq_flow *f)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213) return fq_flow_is_detached(f) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214) time_after(jiffies, f->age + FQ_GC_AGE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216)
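/* Garbage collect up to FQ_GC_MAX detached flows older than FQ_GC_AGE,
 * gathered while walking the rbtree on the lookup path towards @sk.
 * Collected flows are unlinked from @root and freed in one bulk call.
 */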
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) static void fq_gc(struct fq_sched_data *q,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218) struct rb_root *root,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) struct sock *sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221) struct rb_node **p, *parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) void *tofree[FQ_GC_MAX];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223) struct fq_flow *f;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224) int i, fcnt = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226) p = &root->rb_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227) parent = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) while (*p) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229) parent = *p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) f = rb_entry(parent, struct fq_flow, fq_node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232) if (f->sk == sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235) if (fq_gc_candidate(f)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236) tofree[fcnt++] = f;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237) if (fcnt == FQ_GC_MAX)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241) if (f->sk > sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) p = &parent->rb_right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244) p = &parent->rb_left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) if (!fcnt)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250) for (i = fcnt; i > 0; ) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251) f = tofree[--i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252) rb_erase(&f->fq_node, root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254) q->flows -= fcnt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255) q->inactive_flows -= fcnt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256) q->stat_gc_flows += fcnt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258) kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260)
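/* Map @skb to its flow: TC_PRIO_CONTROL packets use the internal flow; other
 * packets are keyed on their socket (or on a hash for orphans, listeners and
 * unconnected sockets) and looked up in one of the fq_root[] rbtrees.
 * A missing flow is allocated on the spot; on allocation failure we fall
 * back to the internal flow.
 */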
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261) static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263) struct rb_node **p, *parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264) struct sock *sk = skb->sk;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265) struct rb_root *root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 266) struct fq_flow *f;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 267)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 268) /* warning: no starvation prevention... */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 269) if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 270) return &q->internal;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272) /* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273) * or a listener (SYNCOOKIE mode)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) * 1) request sockets are not full-blown,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275) * they do not contain sk_pacing_rate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276) * 2) They are not part of a 'flow' yet
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277) * 3) We do not want to rate limit them (e.g. a SYN flood attack),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278) * especially if the listener set SO_MAX_PACING_RATE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) * 4) We pretend they are orphaned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281) if (!sk || sk_listener(sk)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282) unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) /* By forcing low order bit to 1, we make sure to not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285) * collide with a local flow (socket pointers are word aligned)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287) sk = (struct sock *)((hash << 1) | 1UL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288) skb_orphan(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289) } else if (sk->sk_state == TCP_CLOSE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) * Sockets in TCP_CLOSE are not connected.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293) * The typical use case is UDP sockets: they can send packets
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294) * with sendto() to many different destinations.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295) * We could probably use a generic bit advertising
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296) * non-connected sockets, instead of sk_state == TCP_CLOSE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297) * if we care enough.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299) sk = (struct sock *)((hash << 1) | 1UL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302) root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304) if (q->flows >= (2U << q->fq_trees_log) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305) q->inactive_flows > q->flows/2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306) fq_gc(q, root, sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308) p = &root->rb_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309) parent = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310) while (*p) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311) parent = *p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313) f = rb_entry(parent, struct fq_flow, fq_node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314) if (f->sk == sk) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) /* socket might have been reallocated, so check
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316) * if its sk_hash is the same.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317) * If not, we need to refill the credit with the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318) * initial quantum
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320) if (unlikely(skb->sk == sk &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321) f->socket_hash != sk->sk_hash)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322) f->credit = q->initial_quantum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323) f->socket_hash = sk->sk_hash;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324) if (q->rate_enable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 325) smp_store_release(&sk->sk_pacing_status,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 326) SK_PACING_FQ);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 327) if (fq_flow_is_throttled(f))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 328) fq_flow_unset_throttled(q, f);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 329) f->time_next_packet = 0ULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 330) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 331) return f;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 332) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 333) if (f->sk > sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 334) p = &parent->rb_right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 335) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 336) p = &parent->rb_left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 337) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 338)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 339) f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 340) if (unlikely(!f)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 341) q->stat_allocation_errors++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 342) return &q->internal;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 343) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 344) /* f->t_root is already zeroed after kmem_cache_zalloc() */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 345)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 346) fq_flow_set_detached(f);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 347) f->sk = sk;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 348) if (skb->sk == sk) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 349) f->socket_hash = sk->sk_hash;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 350) if (q->rate_enable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 351) smp_store_release(&sk->sk_pacing_status,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 352) SK_PACING_FQ);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 353) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 354) f->credit = q->initial_quantum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 355)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 356) rb_link_node(&f->fq_node, parent, p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 357) rb_insert_color(&f->fq_node, root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 358)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 359) q->flows++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 360) q->inactive_flows++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 361) return f;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 362) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 363)
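/* Return, without removing it, the packet with the earliest time_to_send,
 * considering both the head of the linear list and the leftmost t_root entry.
 */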
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 364) static struct sk_buff *fq_peek(struct fq_flow *flow)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 365) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 366) struct sk_buff *skb = skb_rb_first(&flow->t_root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 367) struct sk_buff *head = flow->head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 368)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 369) if (!skb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 370) return head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 371)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 372) if (!head)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 373) return skb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 374)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 375) if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 376) return skb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 377) return head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 378) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 379)
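/* Unlink @skb from @flow: pop it from the head of the linear list, or erase
 * it from the t_root rbtree (restoring skb->dev, whose storage is shared
 * with the rbnode).
 */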
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 380) static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 381) struct sk_buff *skb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 382) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 383) if (skb == flow->head) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 384) flow->head = skb->next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 385) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 386) rb_erase(&skb->rbnode, &flow->t_root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 387) skb->dev = qdisc_dev(sch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 388) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 389) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 390)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 391) /* Remove one skb from the flow queue.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 392) * This skb must be the return value of a prior fq_peek().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 393) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 394) static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 395) struct sk_buff *skb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 396) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 397) fq_erase_head(sch, flow, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 398) skb_mark_not_on_list(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 399) flow->qlen--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 400) qdisc_qstats_backlog_dec(sch, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 401) sch->q.qlen--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 402) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 403)
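/* Queue @skb on @flow. Fast path: when its time_to_send is not before the
 * current tail's, append to the linear list in O(1). Otherwise insert it
 * into the t_root rbtree, ordered by time_to_send.
 */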
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 404) static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 405) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 406) struct rb_node **p, *parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 407) struct sk_buff *head, *aux;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 408)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 409) head = flow->head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 410) if (!head ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 411) fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 412) if (!head)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 413) flow->head = skb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 414) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 415) flow->tail->next = skb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 416) flow->tail = skb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 417) skb->next = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 418) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 419) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 420)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 421) p = &flow->t_root.rb_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 422) parent = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 423)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 424) while (*p) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 425) parent = *p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 426) aux = rb_to_skb(parent);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 427) if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 428) p = &parent->rb_right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 429) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 430) p = &parent->rb_left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 431) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 432) rb_link_node(&skb->rbnode, parent, p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 433) rb_insert_color(&skb->rbnode, &flow->t_root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 434) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 435)
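/* True if the packet's requested delivery time is more than one horizon
 * beyond our cached clock.
 */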
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 436) static bool fq_packet_beyond_horizon(const struct sk_buff *skb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 437) const struct fq_sched_data *q)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 438) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 439) return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 440) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 441)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 442) static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 443) struct sk_buff **to_free)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 444) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 445) struct fq_sched_data *q = qdisc_priv(sch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 446) struct fq_flow *f;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 447)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 448) if (unlikely(sch->q.qlen >= sch->limit))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 449) return qdisc_drop(skb, sch, to_free);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 450)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 451) if (!skb->tstamp) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 452) fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 453) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 454) /* Check if the packet timestamp is too far in the future.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 455) * Check first against our cached value, to avoid the ktime_get_ns()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 456) * cost in most cases.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 457) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 458) if (fq_packet_beyond_horizon(skb, q)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 459) /* Refresh our cache and check another time */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 460) q->ktime_cache = ktime_get_ns();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 461) if (fq_packet_beyond_horizon(skb, q)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 462) if (q->horizon_drop) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 463) q->stat_horizon_drops++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 464) return qdisc_drop(skb, sch, to_free);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 465) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 466) q->stat_horizon_caps++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 467) skb->tstamp = q->ktime_cache + q->horizon;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 468) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 469) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 470) fq_skb_cb(skb)->time_to_send = skb->tstamp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 471) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 472)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 473) f = fq_classify(skb, q);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 474) if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 475) q->stat_flows_plimit++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 476) return qdisc_drop(skb, sch, to_free);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 477) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 478)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 479) f->qlen++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 480) qdisc_qstats_backlog_inc(sch, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 481) if (fq_flow_is_detached(f)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 482) fq_flow_add_tail(&q->new_flows, f);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 483) if (time_after(jiffies, f->age + q->flow_refill_delay))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 484) f->credit = max_t(u32, f->credit, q->quantum);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 485) q->inactive_flows--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 486) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 487)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 488) /* Note: this overwrites f->age */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 489) flow_queue_add(f, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 490)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 491) if (unlikely(f == &q->internal)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 492) q->stat_internal_packets++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 493) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 494) sch->q.qlen++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 495)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 496) return NET_XMIT_SUCCESS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 497) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 498)
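/* Move flows whose time_next_packet is no longer in the future from the
 * 'delayed' rbtree back to the old_flows list, and refresh the unthrottle
 * latency EWMA used for diagnostics.
 */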
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 499) static void fq_check_throttled(struct fq_sched_data *q, u64 now)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 500) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 501) unsigned long sample;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 502) struct rb_node *p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 503)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 504) if (q->time_next_delayed_flow > now)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 505) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 506)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 507) /* Update the unthrottle latency EWMA.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 508) * This is cheap and can help diagnose timer/latency problems.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 509) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 510) sample = (unsigned long)(now - q->time_next_delayed_flow);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 511) q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 512) q->unthrottle_latency_ns += sample >> 3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 513)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 514) q->time_next_delayed_flow = ~0ULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 515) while ((p = rb_first(&q->delayed)) != NULL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 516) struct fq_flow *f = rb_entry(p, struct fq_flow, rate_node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 517)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 518) if (f->time_next_packet > now) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 519) q->time_next_delayed_flow = f->time_next_packet;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 520) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 521) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 522) fq_flow_unset_throttled(q, f);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 523) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 524) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 525)
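/* Deficit Round Robin dequeue: the internal flow is served first, then the
 * new/old flow lists. A flow with non-positive credit gets one quantum and
 * moves to the tail of old_flows; a flow whose next packet is not yet due is
 * parked in the 'delayed' rbtree until time_next_packet.
 */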
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 526) static struct sk_buff *fq_dequeue(struct Qdisc *sch)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 527) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 528) struct fq_sched_data *q = qdisc_priv(sch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 529) struct fq_flow_head *head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 530) struct sk_buff *skb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 531) struct fq_flow *f;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 532) unsigned long rate;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 533) u32 plen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 534) u64 now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 535)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 536) if (!sch->q.qlen)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 537) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 538)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 539) skb = fq_peek(&q->internal);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 540) if (unlikely(skb)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 541) fq_dequeue_skb(sch, &q->internal, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 542) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 543) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 544)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 545) q->ktime_cache = now = ktime_get_ns();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 546) fq_check_throttled(q, now);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 547) begin:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 548) head = &q->new_flows;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 549) if (!head->first) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 550) head = &q->old_flows;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 551) if (!head->first) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 552) if (q->time_next_delayed_flow != ~0ULL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 553) qdisc_watchdog_schedule_range_ns(&q->watchdog,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 554) q->time_next_delayed_flow,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 555) q->timer_slack);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 556) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 557) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 558) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 559) f = head->first;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 560)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 561) if (f->credit <= 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 562) f->credit += q->quantum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 563) head->first = f->next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 564) fq_flow_add_tail(&q->old_flows, f);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 565) goto begin;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 566) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 567)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 568) skb = fq_peek(f);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 569) if (skb) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 570) u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 571) f->time_next_packet);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 572)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 573) if (now < time_next_packet) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 574) head->first = f->next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 575) f->time_next_packet = time_next_packet;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 576) fq_flow_set_throttled(q, f);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 577) goto begin;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 578) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 579) prefetch(&skb->end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 580) if ((s64)(now - time_next_packet - q->ce_threshold) > 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 581) INET_ECN_set_ce(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 582) q->stat_ce_mark++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 583) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 584) fq_dequeue_skb(sch, f, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 585) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 586) head->first = f->next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 587) /* force a pass through old_flows to prevent starvation */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 588) if ((head == &q->new_flows) && q->old_flows.first) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 589) fq_flow_add_tail(&q->old_flows, f);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 590) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 591) fq_flow_set_detached(f);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 592) q->inactive_flows++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 593) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 594) goto begin;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 595) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 596) plen = qdisc_pkt_len(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 597) f->credit -= plen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 598)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 599) if (!q->rate_enable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 600) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 601)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 602) rate = q->flow_max_rate;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 603)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 604) /* If an EDT time was provided for this skb, we need to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 605) * update f->time_next_packet only if this qdisc enforces
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 606) * a flow max rate.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 607) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 608) if (!skb->tstamp) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 609) if (skb->sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 610) rate = min(skb->sk->sk_pacing_rate, rate);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 611)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 612) if (rate <= q->low_rate_threshold) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 613) f->credit = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 614) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 615) plen = max(plen, q->quantum);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 616) if (f->credit > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 617) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 618) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 619) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 620) if (rate != ~0UL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 621) u64 len = (u64)plen * NSEC_PER_SEC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 622)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 623) if (likely(rate))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 624) len = div64_ul(len, rate);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 625) /* Since the socket rate can change later,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 626) * clamp the delay to 1 second.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 627) * Really, providers of too-big packets should be fixed!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 628) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 629) if (unlikely(len > NSEC_PER_SEC)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 630) len = NSEC_PER_SEC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 631) q->stat_pkts_too_long++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 632) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 633) /* Account for scheduling/timer drift.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 634) * f->time_next_packet was set when the prior packet was sent,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 635) * and the current time (@now) can be late by tens of usec.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 636) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 637) if (f->time_next_packet)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 638) len -= min(len/2, now - f->time_next_packet);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 639) f->time_next_packet = now + len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 640) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 641) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 642) qdisc_bstats_update(sch, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 643) return skb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 644) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 645)
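/* Drop every packet still queued in @flow, from both the t_root rbtree and
 * the linear (head, tail) list.
 */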
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 646) static void fq_flow_purge(struct fq_flow *flow)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 647) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 648) struct rb_node *p = rb_first(&flow->t_root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 649)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 650) while (p) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 651) struct sk_buff *skb = rb_to_skb(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 652)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 653) p = rb_next(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 654) rb_erase(&skb->rbnode, &flow->t_root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 655) rtnl_kfree_skbs(skb, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 656) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 657) rtnl_kfree_skbs(flow->head, flow->tail);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 658) flow->head = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 659) flow->qlen = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 660) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 661)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 662) static void fq_reset(struct Qdisc *sch)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 663) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 664) struct fq_sched_data *q = qdisc_priv(sch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 665) struct rb_root *root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 666) struct rb_node *p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 667) struct fq_flow *f;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 668) unsigned int idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 669)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 670) sch->q.qlen = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 671) sch->qstats.backlog = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 672)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 673) fq_flow_purge(&q->internal);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 674)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 675) if (!q->fq_root)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 676) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 677)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 678) for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 679) root = &q->fq_root[idx];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 680) while ((p = rb_first(root)) != NULL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 681) f = rb_entry(p, struct fq_flow, fq_node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 682) rb_erase(p, root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 683)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 684) fq_flow_purge(f);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 685)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 686) kmem_cache_free(fq_flow_cachep, f);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 687) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 688) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 689) q->new_flows.first = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 690) q->old_flows.first = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 691) q->delayed = RB_ROOT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 692) q->flows = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 693) q->inactive_flows = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 694) q->throttled_flows = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 695) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 696)
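/* Move all flows from @old_array (2^old_log buckets) into @new_array
 * (2^new_log buckets), freeing flows that are garbage collection candidates
 * instead of rehashing them.
 */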
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 697) static void fq_rehash(struct fq_sched_data *q,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 698) struct rb_root *old_array, u32 old_log,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 699) struct rb_root *new_array, u32 new_log)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 700) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 701) struct rb_node *op, **np, *parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 702) struct rb_root *oroot, *nroot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 703) struct fq_flow *of, *nf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 704) int fcnt = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 705) u32 idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 706)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 707) for (idx = 0; idx < (1U << old_log); idx++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 708) oroot = &old_array[idx];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 709) while ((op = rb_first(oroot)) != NULL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 710) rb_erase(op, oroot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 711) of = rb_entry(op, struct fq_flow, fq_node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 712) if (fq_gc_candidate(of)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 713) fcnt++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 714) kmem_cache_free(fq_flow_cachep, of);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 715) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 716) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 717) nroot = &new_array[hash_ptr(of->sk, new_log)];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 718)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 719) np = &nroot->rb_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 720) parent = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 721) while (*np) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 722) parent = *np;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 723)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 724) nf = rb_entry(parent, struct fq_flow, fq_node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 725) BUG_ON(nf->sk == of->sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 726)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 727) if (nf->sk > of->sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 728) np = &parent->rb_right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 729) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 730) np = &parent->rb_left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 731) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 732)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 733) rb_link_node(&of->fq_node, parent, np);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 734) rb_insert_color(&of->fq_node, nroot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 735) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 736) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 737) q->flows -= fcnt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 738) q->inactive_flows -= fcnt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 739) q->stat_gc_flows += fcnt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 740) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 741)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 742) static void fq_free(void *addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 743) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 744) kvfree(addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 745) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 746)
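/* Resize the flow hash table to 2^log rbtree roots, rehashing existing flows
 * under the qdisc tree lock. The old array is freed outside the lock.
 */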
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 747) static int fq_resize(struct Qdisc *sch, u32 log)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 748) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 749) struct fq_sched_data *q = qdisc_priv(sch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 750) struct rb_root *array;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 751) void *old_fq_root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 752) u32 idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 753)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 754) if (q->fq_root && log == q->fq_trees_log)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 755) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 756)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 757) /* If XPS was set up, we can allocate memory on the right NUMA node */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 758) array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_RETRY_MAYFAIL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 759) netdev_queue_numa_node_read(sch->dev_queue));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 760) if (!array)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 761) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 762)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 763) for (idx = 0; idx < (1U << log); idx++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 764) array[idx] = RB_ROOT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 765)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 766) sch_tree_lock(sch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 767)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 768) old_fq_root = q->fq_root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 769) if (old_fq_root)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 770) fq_rehash(q, old_fq_root, q->fq_trees_log, array, log);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772) q->fq_root = array;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773) q->fq_trees_log = log;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775) sch_tree_unlock(sch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777) fq_free(old_fq_root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783) [TCA_FQ_UNSPEC] = { .strict_start_type = TCA_FQ_TIMER_SLACK },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785) [TCA_FQ_PLIMIT] = { .type = NLA_U32 },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786) [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787) [TCA_FQ_QUANTUM] = { .type = NLA_U32 },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) [TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789) [TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790) [TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791) [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792) [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793) [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794) [TCA_FQ_ORPHAN_MASK] = { .type = NLA_U32 },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795) [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) [TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797) [TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798) [TCA_FQ_HORIZON] = { .type = NLA_U32 },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) [TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802) static int fq_change(struct Qdisc *sch, struct nlattr *opt,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803) struct netlink_ext_ack *extack)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) struct fq_sched_data *q = qdisc_priv(sch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) struct nlattr *tb[TCA_FQ_MAX + 1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807) int err, drop_count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808) unsigned int drop_len = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809) u32 fq_log;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811) if (!opt)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814) err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815) NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816) if (err < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) sch_tree_lock(sch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821) fq_log = q->fq_trees_log;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823) if (tb[TCA_FQ_BUCKETS_LOG]) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) if (nval >= 1 && nval <= ilog2(256 * 1024))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) fq_log = nval;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829) err = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) if (tb[TCA_FQ_PLIMIT])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832) sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834) if (tb[TCA_FQ_FLOW_PLIMIT])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837) if (tb[TCA_FQ_QUANTUM]) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838) u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) if (quantum > 0 && quantum <= (1 << 20)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) q->quantum = quantum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) err = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) if (tb[TCA_FQ_INITIAL_QUANTUM])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) if (tb[TCA_FQ_FLOW_MAX_RATE]) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857)
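		/* ~0U from user space means "unlimited"; map it to the
		 * unsigned long sentinel used internally.
		 */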
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) q->flow_max_rate = (rate == ~0U) ? ~0UL : rate;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) q->low_rate_threshold =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) if (tb[TCA_FQ_RATE_ENABLE]) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867) if (enable <= 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) q->rate_enable = enable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) err = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) if (tb[TCA_FQ_ORPHAN_MASK])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880) q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) if (tb[TCA_FQ_CE_THRESHOLD])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) q->ce_threshold = (u64)NSEC_PER_USEC *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) if (tb[TCA_FQ_TIMER_SLACK])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) if (tb[TCA_FQ_HORIZON])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) q->horizon = (u64)NSEC_PER_USEC *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) nla_get_u32(tb[TCA_FQ_HORIZON]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) if (tb[TCA_FQ_HORIZON_DROP])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) q->horizon_drop = nla_get_u8(tb[TCA_FQ_HORIZON_DROP]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) if (!err) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) sch_tree_unlock(sch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899) err = fq_resize(sch, fq_log);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) sch_tree_lock(sch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) }
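	/* sch->limit may have been lowered: drop excess packets now and
	 * report the freed backlog to ancestor qdiscs.
	 */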
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902) while (sch->q.qlen > sch->limit) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903) struct sk_buff *skb = fq_dequeue(sch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) if (!skb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907) drop_len += qdisc_pkt_len(skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) rtnl_kfree_skbs(skb, skb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) drop_count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) qdisc_tree_reduce_backlog(sch, drop_count, drop_len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) sch_tree_unlock(sch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916)
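/* Free all queued packets and per-flow state when the qdisc goes away. */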
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) static void fq_destroy(struct Qdisc *sch)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) struct fq_sched_data *q = qdisc_priv(sch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921) fq_reset(sch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) fq_free(q->fq_root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923) qdisc_watchdog_cancel(&q->watchdog);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) static int fq_init(struct Qdisc *sch, struct nlattr *opt,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) struct netlink_ext_ack *extack)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929) struct fq_sched_data *q = qdisc_priv(sch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931)
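	/* Default parameters; fq_change() below may override them when
	 * the user supplied netlink attributes in @opt.
	 */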
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932) sch->limit = 10000;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933) q->flow_plimit = 100;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) q->quantum = 2 * psched_mtu(qdisc_dev(sch));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936) q->flow_refill_delay = msecs_to_jiffies(40);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) q->flow_max_rate = ~0UL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) q->time_next_delayed_flow = ~0ULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939) q->rate_enable = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) q->new_flows.first = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) q->old_flows.first = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) q->delayed = RB_ROOT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943) q->fq_root = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944) q->fq_trees_log = ilog2(1024);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) q->orphan_mask = 1024 - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) q->low_rate_threshold = 550000 / 8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948) q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950) q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) q->horizon_drop = 1; /* by default, drop packets beyond horizon */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953) /* Default ce_threshold of 4294 seconds (~0U usecs): CE marking is effectively disabled */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) q->ce_threshold = (u64)NSEC_PER_USEC * ~0U;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956) qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958) if (opt)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) err = fq_change(sch, opt, extack);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961) err = fq_resize(sch, q->fq_trees_log);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965)
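/* Dump the current configuration back to user space (tc qdisc show).
 * The ce_threshold and horizon values kept in nanoseconds internally are
 * converted back to microseconds.
 */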
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) struct fq_sched_data *q = qdisc_priv(sch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) u64 ce_threshold = q->ce_threshold;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970) u64 horizon = q->horizon;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) struct nlattr *opts;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) if (opts == NULL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) goto nla_put_failure;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) do_div(ce_threshold, NSEC_PER_USEC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) do_div(horizon, NSEC_PER_USEC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984) nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986) nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987) nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988) min_t(unsigned long, q->flow_max_rate, ~0U)) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) jiffies_to_usecs(q->flow_refill_delay)) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991) nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992) nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) q->low_rate_threshold) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994) nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995) nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) goto nla_put_failure;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) return nla_nest_end(skb, opts);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) nla_put_failure:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006)
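/* Copy a snapshot of the internal counters to user space under the tree lock. */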
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) struct fq_sched_data *q = qdisc_priv(sch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) struct tc_fq_qd_stats st;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) sch_tree_lock(sch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) st.gc_flows = q->stat_gc_flows;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) st.highprio_packets = q->stat_internal_packets;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) st.tcp_retrans = 0; /* no longer tracked by fq */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) st.throttled = q->stat_throttled;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) st.flows_plimit = q->stat_flows_plimit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) st.pkts_too_long = q->stat_pkts_too_long;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) st.allocation_errors = q->stat_allocation_errors;
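	/* Delay (in ns) from now until the earliest throttled flow is due,
	 * including the configured timer slack.
	 */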
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack -
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) ktime_get_ns();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) st.flows = q->flows;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) st.inactive_flows = q->inactive_flows;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) st.throttled_flows = q->throttled_flows;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) st.unthrottle_latency_ns = min_t(unsigned long,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) q->unthrottle_latency_ns, ~0U);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) st.ce_mark = q->stat_ce_mark;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) st.horizon_drops = q->stat_horizon_drops;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) st.horizon_caps = q->stat_horizon_caps;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) sch_tree_unlock(sch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) return gnet_stats_copy_app(d, &st, sizeof(st));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) .id = "fq",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) .priv_size = sizeof(struct fq_sched_data),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) .enqueue = fq_enqueue,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) .dequeue = fq_dequeue,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) .peek = qdisc_peek_dequeued,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) .init = fq_init,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) .reset = fq_reset,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) .destroy = fq_destroy,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) .change = fq_change,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) .dump = fq_dump,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) .dump_stats = fq_dump_stats,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) .owner = THIS_MODULE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051)
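/* Create the kmem cache used for struct fq_flow, then register the qdisc. */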
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) static int __init fq_module_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) fq_flow_cachep = kmem_cache_create("fq_flow_cache",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) sizeof(struct fq_flow),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) 0, 0, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) if (!fq_flow_cachep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) ret = register_qdisc(&fq_qdisc_ops);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) kmem_cache_destroy(fq_flow_cachep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) static void __exit fq_module_exit(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) unregister_qdisc(&fq_qdisc_ops);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) kmem_cache_destroy(fq_flow_cachep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) module_init(fq_module_init)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) module_exit(fq_module_exit)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) MODULE_AUTHOR("Eric Dumazet");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) MODULE_LICENSE("GPL");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) MODULE_DESCRIPTION("Fair Queue Packet Scheduler");