^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) // SPDX-License-Identifier: GPL-2.0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) /* Copyright (c) 2019 Facebook
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) * This program is free software; you can redistribute it and/or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) * modify it under the terms of version 2 of the GNU General Public
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) * License as published by the Free Software Foundation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) * Sample Host Bandwidth Manager (HBM) BPF program.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) * A cgroup skb BPF egress program to limit cgroup output bandwidth.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11) * It uses a modified virtual token bucket queue to limit average
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12) * egress bandwidth. The implementation uses credits instead of tokens.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13) * Negative credits imply that queueing would have happened (this is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14) * a virtual queue, so no queueing is done by it). However, queueing may
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) * occur at the actual qdisc (which is not used for rate limiting).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17) * This implementation uses 3 thresholds, one to start marking packets and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18) * the other two to drop packets:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19) * CREDIT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20) * - <--------------------------|------------------------> +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 21) * | | | 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 22) * | Large pkt |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23) * | drop thresh |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24) * Small pkt drop Mark threshold
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) * thresh
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27) * The effect of marking depends on the type of packet:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28) * a) If the packet is ECN enabled and it is a TCP packet, then the packet
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29) * is ECN marked.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 30) * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 31) * to reduce the congestion window. The current implementation uses a linear
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 32) * distribution (0% probability at marking threshold, 100% probability
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 33) * at drop threshold).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 34) * c) If the packet is not a TCP packet, then it is dropped.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 35) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 36) * If the credit is below the drop threshold, the packet is dropped. If it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 37) * is a TCP packet, then it also calls tcp_cwr since packets dropped
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 38) * by a cgroup skb BPF program do not automatically trigger a call to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 39) * tcp_cwr in the current kernel code.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 40) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 41) * This BPF program actually uses 2 drop thresholds, one threshold
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42) * for larger packets (>= 120 bytes) and another for smaller packets. This
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43) * protects smaller packets such as SYNs, ACKs, etc.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 44) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 45) * The default bandwidth limit is set at 1Gbps but this can be changed by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 46) * a user program through a shared BPF map. In addition, by default this BPF
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 47) * program does not limit connections using loopback. This behavior can be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48) * overridden by the user program. There is also an option to calculate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49) * some statistics, such as percent of packets marked or dropped, which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50) * the user program can access.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52) * A later patch provides such a program (hbm.c).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55) #include "hbm_kern.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57) SEC("cgroup_skb/egress")
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 58) int _hbm_out_cg(struct __sk_buff *skb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 59) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 60) struct hbm_pkt_info pkti;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 61) int len = skb->len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62) unsigned int queue_index = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63) unsigned long long curtime;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64) int credit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65) signed long long delta = 0, new_credit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66) int max_credit = MAX_CREDIT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67) bool congestion_flag = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68) bool drop_flag = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 69) bool cwr_flag = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 70) bool ecn_ce_flag = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 71) struct hbm_vqueue *qdp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 72) struct hbm_queue_stats *qsp = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 73) int rv = ALLOW_PKT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 74)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 75) qsp = bpf_map_lookup_elem(&queue_stats, &queue_index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76) if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 77) return ALLOW_PKT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 78)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 79) hbm_get_pkt_info(skb, &pkti);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 80)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 81) // We may want to account for the length of headers in len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 82) // calculation, like ETH header + overhead, specially if it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 83) // is a gso packet. But I am not doing it right now.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 84)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 85) qdp = bpf_get_local_storage(&queue_state, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 86) if (!qdp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 87) return ALLOW_PKT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 88) else if (qdp->lasttime == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 89) hbm_init_vqueue(qdp, 1024);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 90)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 91) curtime = bpf_ktime_get_ns();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 92)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 93) // Begin critical section
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 94) bpf_spin_lock(&qdp->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 95) credit = qdp->credit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 96) delta = curtime - qdp->lasttime;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 97) /* delta < 0 implies that another process with a curtime greater
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 98) * than ours beat us to the critical section and already added
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 99) * the new credit, so we should not add it ourselves
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) if (delta > 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) qdp->lasttime = curtime;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) new_credit = credit + CREDIT_PER_NS(delta, qdp->rate);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) if (new_credit > MAX_CREDIT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105) credit = MAX_CREDIT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107) credit = new_credit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109) credit -= len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) qdp->credit = credit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) bpf_spin_unlock(&qdp->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) // End critical section
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114) // Check if we should update rate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115) if (qsp != NULL && (qsp->rate * 128) != qdp->rate) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116) qdp->rate = qsp->rate * 128;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) bpf_printk("Updating rate: %d (1sec:%llu bits)\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118) (int)qdp->rate,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) CREDIT_PER_NS(1000000000, qdp->rate) * 8);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) // Set flags (drop, congestion, cwr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123) // Dropping => we are congested, so ignore congestion flag
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124) if (credit < -DROP_THRESH ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) (len > LARGE_PKT_THRESH && credit < -LARGE_PKT_DROP_THRESH)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126) // Very congested, set drop packet
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127) drop_flag = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128) if (pkti.ecn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) congestion_flag = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130) else if (pkti.is_tcp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) cwr_flag = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) } else if (credit < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) // Congested, set congestion flag
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) if (pkti.ecn || pkti.is_tcp) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135) if (credit < -MARK_THRESH)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) congestion_flag = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) congestion_flag = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) congestion_flag = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) if (congestion_flag) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) if (bpf_skb_ecn_set_ce(skb)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) ecn_ce_flag = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148) if (pkti.is_tcp) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) unsigned int rand = bpf_get_prandom_u32();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) if (-credit >= MARK_THRESH +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152) (rand % MARK_REGION_SIZE)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) // Do congestion control
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) cwr_flag = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156) } else if (len > LARGE_PKT_THRESH) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) // Problem if too many small packets?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158) drop_flag = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163) if (qsp != NULL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) if (qsp->no_cn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165) cwr_flag = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168) cwr_flag, ecn_ce_flag, &pkti, credit);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) if (drop_flag) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171) __sync_add_and_fetch(&(qdp->credit), len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) rv = DROP_PKT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175) if (cwr_flag)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176) rv |= 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177) return rv;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178) }
/* License string "GPL", placed in the "license" section via SEC().
 * NOTE(review): presumably read by the BPF loader to declare the program
 * GPL-compatible -- confirm against the libbpf documentation.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179) char _license[] SEC("license") = "GPL";