^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) // SPDX-License-Identifier: GPL-2.0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) /* Copyright (c) 2019 Facebook
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) * This program is free software; you can redistribute it and/or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) * modify it under the terms of version 2 of the GNU General Public
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) * License as published by the Free Software Foundation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) * Sample Host Bandwidth Manager (HBM) BPF program.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) * A cgroup skb BPF egress program to limit cgroup output bandwidth.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11) * It uses a modified virtual token bucket queue to limit average
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12) * egress bandwidth. The implementation uses credits instead of tokens.
 * Negative credits imply that queueing would have happened (this is
 * a virtual queue, so no queueing is done by it). However, queueing may
 * occur at the actual qdisc (which is not used for rate limiting).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17) * This implementation uses 3 thresholds, one to start marking packets and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18) * the other two to drop packets:
 *                                  CREDIT
 *        - <--------------------------|------------------------> +
 *              |    |          |      0
 *              |  Large pkt    |
 *              |  drop thresh  |
 *   Small pkt drop             Mark threshold
 *       thresh
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27) * The effect of marking depends on the type of packet:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28) * a) If the packet is ECN enabled and it is a TCP packet, then the packet
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29) * is ECN marked.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 30) * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 31) * to reduce the congestion window. The current implementation uses a linear
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 32) * distribution (0% probability at marking threshold, 100% probability
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 33) * at drop threshold).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 34) * c) If the packet is not a TCP packet, then it is dropped.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 35) *
 * If the credit is below the drop threshold, the packet is dropped. If it
 * is a TCP packet, then it also calls tcp_cwr since packets dropped by
 * a cgroup skb BPF program do not automatically trigger a call to
 * tcp_cwr in the current kernel code.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 40) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 41) * This BPF program actually uses 2 drop thresholds, one threshold
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42) * for larger packets (>= 120 bytes) and another for smaller packets. This
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43) * protects smaller packets such as SYNs, ACKs, etc.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 44) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 45) * The default bandwidth limit is set at 1Gbps but this can be changed by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 46) * a user program through a shared BPF map. In addition, by default this BPF
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 47) * program does not limit connections using loopback. This behavior can be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48) * overwritten by the user program. There is also an option to calculate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49) * some statistics, such as percent of packets marked or dropped, which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50) * a user program, such as hbm, can access.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53) #include "hbm_kern.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55) SEC("cgroup_skb/egress")
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56) int _hbm_out_cg(struct __sk_buff *skb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 58) long long delta = 0, delta_send;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 59) unsigned long long curtime, sendtime;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 60) struct hbm_queue_stats *qsp = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 61) unsigned int queue_index = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62) bool congestion_flag = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63) bool ecn_ce_flag = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64) struct hbm_pkt_info pkti = {};
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65) struct hbm_vqueue *qdp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66) bool drop_flag = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67) bool cwr_flag = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68) int len = skb->len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 69) int rv = ALLOW_PKT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 70)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 71) qsp = bpf_map_lookup_elem(&queue_stats, &queue_index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 72)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 73) // Check if we should ignore loopback traffic
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 74) if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 75) return ALLOW_PKT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 77) hbm_get_pkt_info(skb, &pkti);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 78)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 79) // We may want to account for the length of headers in len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 80) // calculation, like ETH header + overhead, specially if it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 81) // is a gso packet. But I am not doing it right now.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 82)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 83) qdp = bpf_get_local_storage(&queue_state, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 84) if (!qdp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 85) return ALLOW_PKT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 86) if (qdp->lasttime == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 87) hbm_init_edt_vqueue(qdp, 1024);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 88)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 89) curtime = bpf_ktime_get_ns();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 90)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 91) // Begin critical section
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 92) bpf_spin_lock(&qdp->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 93) delta = qdp->lasttime - curtime;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 94) // bound bursts to 100us
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 95) if (delta < -BURST_SIZE_NS) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 96) // negative delta is a credit that allows bursts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 97) qdp->lasttime = curtime - BURST_SIZE_NS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 98) delta = -BURST_SIZE_NS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 99) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) sendtime = qdp->lasttime;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) delta_send = BYTES_TO_NS(len, qdp->rate);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) __sync_add_and_fetch(&(qdp->lasttime), delta_send);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) bpf_spin_unlock(&qdp->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) // End critical section
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) // Set EDT of packet
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107) skb->tstamp = sendtime;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109) // Check if we should update rate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) if (qsp != NULL && (qsp->rate * 128) != qdp->rate)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) qdp->rate = qsp->rate * 128;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) // Set flags (drop, congestion, cwr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114) // last packet will be sent in the future, bound latency
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115) if (delta > DROP_THRESH_NS || (delta > LARGE_PKT_DROP_THRESH_NS &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116) len > LARGE_PKT_THRESH)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) drop_flag = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118) if (pkti.is_tcp && pkti.ecn == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) cwr_flag = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) } else if (delta > MARK_THRESH_NS) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121) if (pkti.is_tcp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) congestion_flag = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124) drop_flag = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127) if (congestion_flag) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128) if (bpf_skb_ecn_set_ce(skb)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) ecn_ce_flag = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) if (pkti.is_tcp) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) unsigned int rand = bpf_get_prandom_u32();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) if (delta >= MARK_THRESH_NS +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135) (rand % MARK_REGION_SIZE_NS)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) // Do congestion control
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) cwr_flag = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) } else if (len > LARGE_PKT_THRESH) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) // Problem if too many small packets?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) drop_flag = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) congestion_flag = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) if (pkti.is_tcp && drop_flag && pkti.packets_out <= 1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148) drop_flag = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) cwr_flag = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) congestion_flag = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) if (qsp != NULL && qsp->no_cn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) cwr_flag = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156) hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) cwr_flag, ecn_ce_flag, &pkti, (int) delta);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159) if (drop_flag) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160) __sync_add_and_fetch(&(qdp->lasttime), -delta_send);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161) rv = DROP_PKT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) if (cwr_flag)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165) rv |= CWR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166) return rv;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) }
/* License string emitted into the "license" section via SEC().
 * NOTE(review): presumably read by the BPF loader to decide whether
 * GPL-only helpers may be used - confirm against the loader.
 */
char _license[] SEC("license") = "GPL";