// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*
* Example program for Host Bandwidth Management
*
* This program loads a cgroup skb BPF program to enforce cgroup output
* (egress) or input (ingress) bandwidth limits.
*
* USAGE: hbm [-d] [-l] [-n <id>] [--no_cn] [-r <rate>] [-s] [-t <secs>] [-w] [-h] [prog]
* Where:
* -d Print BPF trace debug buffer
* -l Also limit flows doing loopback
* -n <#> To create cgroup "/hbm#" and attach prog
* Default is /hbm1
* --no_cn Do not return cn notifications
* -r <rate> Rate limit in Mbps
* -s Get HBM stats (marked, dropped, etc.)
* -t <time> Exit after specified seconds (default is 1)
* -w Work conserving flag. cgroup can increase its bandwidth
* beyond the rate limit specified while there is available
* bandwidth. Current implementation assumes there is only
* one NIC (eth0), but can be extended to support multiple NICs.
* Currently only supported for egress.
* -h Print this info
* prog BPF program file name. Name defaults to hbm_out_kern.o
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/unistd.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>
#include <getopt.h>
#include "bpf_load.h"
#include "bpf_rlimit.h"
#include "cgroup_helpers.h"
#include "hbm.h"
#include "bpf_util.h"
#include "bpf.h"
#include "libbpf.h"
/*
 * Run-time configuration, filled in by main() from the command line and
 * consumed by run_bpf_prog().
 */

/* true: attach as egress; false: attach as ingress. Nothing in this file
 * ever clears it, so only egress is exercised at present.
 */
bool outFlag = true;
/* Value given with -r (scaled by 1.024 in main()); also the floor the
 * work-conserving loop will not go below when it lowers "rate".
 */
int minRate = 1000; /* cgroup rate limit in Mbps */
/* Limit currently written to the queue_stats map entry. */
int rate = 1000; /* can grow if rate conserving is enabled */
/* Run time in seconds (-t). */
int dur = 1;
/* -s: ask the BPF program for statistics and write them to a file. */
bool stats_flag;
/* -l: copied to qstats.loopback. */
bool loopback_flag;
/* -d: after the run, dump the trace pipe via read_trace_pipe2(). */
bool debugFlag;
/* -w: enable the rate adjustment loop in run_bpf_prog(). */
bool work_conserving_flag;
/* --no_cn: copied to qstats.no_cn. */
bool no_cn_flag;
/* --edt: selects hbm_edt_kern.o and changes how the average credit is
 * reported in the stats file.
 */
bool edt_flag;
static void Usage(void);
static void read_trace_pipe2(void);
static void do_error(char *msg, bool errno_flag);
/* Directory prefix used to build the trace_pipe path. */
#define DEBUGFS "/sys/kernel/debug/tracing/"
/* Object and program fd filled in by bpf_prog_load_xattr() in prog_load(). */
struct bpf_object *obj;
int bpfprog_fd;
/* Not referenced anywhere in the visible part of this file. */
int cgroup_storage_fd;
/*
 * Copy the kernel trace pipe to stdout and to the file "hbm_out.log".
 *
 * Opens DEBUGFS "trace_pipe" and echoes every chunk read from it to stdout
 * and, if the log file could be created, to the log file as well. Failure
 * to create the log file is reported but is not fatal; output then goes to
 * stdout only.
 *
 * The loop runs until read() fails with an error other than EINTR/EAGAIN,
 * at which point the error is reported and both the pipe and the log file
 * are closed. In normal operation the function therefore does not return
 * and the process has to be interrupted by the user.
 */
static void read_trace_pipe2(void)
{
	static char buf[4097];
	/* Only the egress log name is used; ingress is not supported yet. */
	char *outFname = "hbm_out.log";
	FILE *outf;
	int trace_fd;

	trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
	if (trace_fd < 0) {
		printf("Error opening trace_pipe\n");
		return;
	}

	outf = fopen(outFname, "w");
	if (outf == NULL)
		printf("Error creating %s\n", outFname);

	while (1) {
		ssize_t sz;

		/* Leave room for the terminating NUL added below. */
		sz = read(trace_fd, buf, sizeof(buf) - 1);
		if (sz < 0) {
			/* Transient conditions: just try again. */
			if (errno == EINTR || errno == EAGAIN)
				continue;
			/* Anything else would repeat forever; give up. */
			printf("Error reading trace_pipe, errno: %d\n", errno);
			break;
		}
		if (sz == 0)
			continue;

		buf[sz] = 0;
		puts(buf);
		if (outf != NULL) {
			fprintf(outf, "%s\n", buf);
			fflush(outf);
		}
	}

	if (outf != NULL)
		fclose(outf);
	close(trace_fd);
}
/*
 * Print a fatal error message on stdout and terminate with exit status 1.
 *
 * msg:        description of what failed
 * errno_flag: when true, the current value of errno is appended
 *
 * Does not return.
 */
static void do_error(char *msg, bool errno_flag)
{
	if (!errno_flag) {
		printf("ERROR: %s\n", msg);
		exit(1);
	}

	printf("ERROR: %s, errno: %d\n", msg, errno);
	exit(1);
}
/*
 * Load the BPF object file and locate its "queue_stats" map.
 *
 * prog: path of the BPF object file to load as a cgroup skb program
 *
 * On success the globals "obj" and "bpfprog_fd" are filled in by
 * bpf_prog_load_xattr() and the file descriptor of the "queue_stats" map
 * is returned. On any failure (file not readable, load failure, map
 * missing) a message is printed and -1 is returned.
 */
static int prog_load(char *prog)
{
	struct bpf_prog_load_attr prog_load_attr = {
		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
		.file = prog,
		.expected_attach_type = BPF_CGROUP_INET_EGRESS,
	};
	struct bpf_map *map;
	int map_fd = -1;
	int ret = 0;

	/* R_OK is the access() mode for "readable"; O_RDONLY is an open()
	 * flag and only happened to test for existence.
	 */
	if (access(prog, R_OK) < 0) {
		printf("Error accessing file %s: %s\n", prog, strerror(errno));
		/* The caller only recognises a negative value as failure;
		 * a positive return would be mistaken for a map fd.
		 */
		return -1;
	}

	if (bpf_prog_load_xattr(&prog_load_attr, &obj, &bpfprog_fd)) {
		ret = 1;
	} else {
		map = bpf_object__find_map_by_name(obj, "queue_stats");
		map_fd = map ? bpf_map__fd(map) : -ENOENT;
		if (map_fd < 0) {
			/* map_fd holds a negated errno value here. */
			printf("Map not found: %s\n", strerror(-map_fd));
			ret = 1;
		}
	}

	if (ret) {
		printf("ERROR: load_bpf_file failed for: %s\n", prog);
		printf(" Output from verifier:\n%s\n------\n", bpf_log_buf);
		return -1;
	}

	return map_fd;
}
#define DELTA_RATE_CHECK 10000		/* in us */
#define RATE_THRESHOLD 9500000000	/* 9.5 Gbps */
#define RET_VAL_COUNT 4

/*
 * Read the transmit byte counter of eth0 from sysfs.
 *
 * Any failure to open or parse the file is fatal (do_error() exits).
 */
static unsigned long long read_eth0_tx_bytes(void)
{
	unsigned long long tx_bytes = 0;
	FILE *fin;

	fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", "r");
	if (fin == NULL)
		do_error("fopen of eth0 tx_bytes fails", true);
	if (fscanf(fin, "%llu", &tx_bytes) != 1)
		do_error("fscanf fails", false);
	fclose(fin);
	return tx_bytes;
}

/*
 * Load the BPF program, attach it to cgroup "/hbm<cg_id>", run for "dur"
 * seconds and optionally write statistics.
 *
 * prog:  path of the BPF object file
 * cg_id: number appended to "/hbm" to form the cgroup name; also used in
 *        the name of the stats file ("hbm.<cg_id>.out")
 *
 * Steps:
 *  1. prog_load() loads the object and returns the queue_stats map fd.
 *  2. The cgroup is created, this process joins it, the configuration
 *     (rate and flags) is written to the map and the program is attached.
 *  3. With work_conserving_flag set, the eth0 tx counter and the cgroup's
 *     own byte counter are sampled every DELTA_RATE_CHECK microseconds and
 *     "rate" is raised or lowered accordingly; otherwise the function just
 *     sleeps for "dur" seconds.
 *  4. With stats_flag set, the final map contents are written to the
 *     stats file.
 *  5. With debugFlag set, read_trace_pipe2() is called to dump the trace
 *     pipe.
 *
 * Returns 0 on success and 1 if setup failed. Failures inside the
 * work-conserving loop are fatal and terminate the process via do_error().
 */
static int run_bpf_prog(char *prog, int cg_id)
{
	struct hbm_queue_stats qstats = {0};
	int type = BPF_CGROUP_INET_EGRESS;
	char cg_dir[100];
	int map_fd;
	int cg1 = 0;
	int key = 0;
	int rc = 0;

	snprintf(cg_dir, sizeof(cg_dir), "/hbm%d", cg_id);

	map_fd = prog_load(prog);
	if (map_fd < 0)
		return 1;

	if (setup_cgroup_environment()) {
		printf("ERROR: setting cgroup environment\n");
		goto err;
	}
	cg1 = create_and_get_cgroup(cg_dir);
	/* NOTE(review): the helper's failure convention is not visible
	 * here; treat both 0 and a negative value as failure.
	 */
	if (cg1 <= 0) {
		printf("ERROR: create_and_get_cgroup\n");
		goto err;
	}
	if (join_cgroup(cg_dir)) {
		printf("ERROR: join_cgroup\n");
		goto err;
	}

	/* Hand the configuration to the BPF program through the map. */
	qstats.rate = rate;
	qstats.stats = stats_flag ? 1 : 0;
	qstats.loopback = loopback_flag ? 1 : 0;
	qstats.no_cn = no_cn_flag ? 1 : 0;
	if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) {
		printf("ERROR: Could not update map element\n");
		goto err;
	}

	if (!outFlag)
		type = BPF_CGROUP_INET_INGRESS;
	if (bpf_prog_attach(bpfprog_fd, cg1, type, 0)) {
		printf("ERROR: bpf_prog_attach fails!\n");
		log_err("Attaching prog");
		goto err;
	}

	if (work_conserving_flag) {
		struct timeval t0, t_last, t_new;
		unsigned long long last_eth_tx_bytes, new_eth_tx_bytes;
		signed long long last_cg_tx_bytes, new_cg_tx_bytes;
		signed long long delta_time, delta_bytes, delta_rate;
		int delta_ms;

		bpf_map_lookup_elem(map_fd, &key, &qstats);
		if (gettimeofday(&t0, NULL) < 0)
			do_error("gettimeofday failed", true);
		t_last = t0;
		last_eth_tx_bytes = read_eth0_tx_bytes();
		last_cg_tx_bytes = qstats.bytes_total;

		while (true) {
			usleep(DELTA_RATE_CHECK);
			if (gettimeofday(&t_new, NULL) < 0)
				do_error("gettimeofday failed", true);

			/* Total elapsed time decides when to stop. */
			delta_ms = (t_new.tv_sec - t0.tv_sec) * 1000 +
				(t_new.tv_usec - t0.tv_usec)/1000;
			if (delta_ms > dur * 1000)
				break;

			/* Time since the previous sample, in microseconds. */
			delta_time = (t_new.tv_sec - t_last.tv_sec) * 1000000 +
				(t_new.tv_usec - t_last.tv_usec);
			if (delta_time == 0)
				continue;
			t_last = t_new;

			new_eth_tx_bytes = read_eth0_tx_bytes();
			printf(" new_eth_tx_bytes:%llu\n",
			       new_eth_tx_bytes);
			bpf_map_lookup_elem(map_fd, &key, &qstats);
			new_cg_tx_bytes = qstats.bytes_total;

			/* bytes per us * 8000000 == bits per second */
			delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes;
			last_eth_tx_bytes = new_eth_tx_bytes;
			delta_rate = (delta_bytes * 8000000) / delta_time;
			printf("%5d - eth_rate:%.1fGbps cg_rate:%.3fGbps",
			       delta_ms, delta_rate/1000000000.0,
			       rate/1000.0);

			if (delta_rate < RATE_THRESHOLD) {
				/* can increase cgroup rate limit, but first
				 * check if we are using the current limit.
				 * Currently increasing by 6.25%, unknown
				 * if that is the optimal rate.
				 */
				int rate_diff100;

				delta_bytes = new_cg_tx_bytes -
					last_cg_tx_bytes;
				last_cg_tx_bytes = new_cg_tx_bytes;
				delta_rate = (delta_bytes * 8000000) /
					delta_time;
				printf(" rate:%.3fGbps",
				       delta_rate/1000000000.0);
				/* Unused share of the limit, in percent. */
				rate_diff100 = (((long long)rate)*1000000 -
						delta_rate) * 100 /
					(((long long) rate) * 1000000);
				printf(" rdiff:%d", rate_diff100);
				if (rate_diff100 <= 3) {
					rate += (rate >> 4);
					if (rate > RATE_THRESHOLD / 1000000)
						rate = RATE_THRESHOLD / 1000000;
					qstats.rate = rate;
					printf(" INC\n");
				} else {
					printf("\n");
				}
			} else {
				/* Need to decrease cgroup rate limit.
				 * Currently decreasing by 12.5%, unknown
				 * if that is optimal
				 */
				printf(" DEC\n");
				rate -= (rate >> 3);
				if (rate < minRate)
					rate = minRate;
				qstats.rate = rate;
			}
			if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY))
				do_error("update map element fails", false);
		}
	} else {
		sleep(dur);
	}

	/* Get stats! */
	if (stats_flag && bpf_map_lookup_elem(map_fd, &key, &qstats)) {
		char fname[100];
		FILE *fout;

		if (!outFlag)
			snprintf(fname, sizeof(fname), "hbm.%d.in", cg_id);
		else
			snprintf(fname, sizeof(fname), "hbm.%d.out", cg_id);
		fout = fopen(fname, "w");
		if (fout == NULL) {
			printf("ERROR: Could not create %s\n", fname);
		} else {
			fprintf(fout, "id:%d\n", cg_id);
			fprintf(fout, "ERROR: Could not lookup queue_stats\n");
			fclose(fout);
		}
	} else if (stats_flag && qstats.lastPacketTime >
		   qstats.firstPacketTime) {
		static const char *returnValNames[] = {
			"DROP_PKT",
			"ALLOW_PKT",
			"DROP_PKT_CWR",
			"ALLOW_PKT_CWR"
		};
		long long delta_us = (qstats.lastPacketTime -
				      qstats.firstPacketTime)/1000;
		unsigned int rate_mbps = 0;
		double percent_pkts, percent_bytes;
		char fname[100];
		FILE *fout;
		int k;

		/* The packet window can be shorter than one microsecond,
		 * in which case no meaningful rate can be computed.
		 */
		if (delta_us > 0)
			rate_mbps = (qstats.bytes_total -
				     qstats.bytes_dropped) * 8 / delta_us;

		/* Only the egress file name is used for now. */
		snprintf(fname, sizeof(fname), "hbm.%d.out", cg_id);
		fout = fopen(fname, "w");
		if (fout == NULL) {
			printf("ERROR: Could not create %s\n", fname);
		} else {
			fprintf(fout, "id:%d\n", cg_id);
			fprintf(fout, "rate_mbps:%u\n", rate_mbps);
			fprintf(fout, "duration:%.1f secs\n",
				(qstats.lastPacketTime -
				 qstats.firstPacketTime) / 1000000000.0);
			fprintf(fout, "packets:%d\n", (int)qstats.pkts_total);
			fprintf(fout, "bytes_MB:%d\n",
				(int)(qstats.bytes_total / 1000000));
			fprintf(fout, "pkts_dropped:%d\n",
				(int)qstats.pkts_dropped);
			fprintf(fout, "bytes_dropped_MB:%d\n",
				(int)(qstats.bytes_dropped / 1000000));

			/* The "+ 1" in the divisors below avoids dividing
			 * by zero when a counter is still 0.
			 */

			/* Marked Pkts and Bytes */
			percent_pkts = (qstats.pkts_marked * 100.0) /
				(qstats.pkts_total + 1);
			percent_bytes = (qstats.bytes_marked * 100.0) /
				(qstats.bytes_total + 1);
			fprintf(fout, "pkts_marked_percent:%6.2f\n",
				percent_pkts);
			fprintf(fout, "bytes_marked_percent:%6.2f\n",
				percent_bytes);

			/* Dropped Pkts and Bytes */
			percent_pkts = (qstats.pkts_dropped * 100.0) /
				(qstats.pkts_total + 1);
			percent_bytes = (qstats.bytes_dropped * 100.0) /
				(qstats.bytes_total + 1);
			fprintf(fout, "pkts_dropped_percent:%6.2f\n",
				percent_pkts);
			fprintf(fout, "bytes_dropped_percent:%6.2f\n",
				percent_bytes);

			/* ECN CE markings */
			percent_pkts = (qstats.pkts_ecn_ce * 100.0) /
				(qstats.pkts_total + 1);
			fprintf(fout, "pkts_ecn_ce:%6.2f (%d)\n", percent_pkts,
				(int)qstats.pkts_ecn_ce);

			/* Average cwnd */
			fprintf(fout, "avg cwnd:%d\n",
				(int)(qstats.sum_cwnd /
				      (qstats.sum_cwnd_cnt + 1)));

			/* Average rtt */
			fprintf(fout, "avg rtt:%d\n",
				(int)(qstats.sum_rtt /
				      (qstats.pkts_total + 1)));

			/* Average credit; the divisor is computed in
			 * 64 bits because 1500 * pkts_total exceeds the
			 * range of int after roughly 1.4M packets.
			 */
			if (edt_flag)
				fprintf(fout, "avg credit_ms:%.03f\n",
					(qstats.sum_credit /
					 (qstats.pkts_total + 1.0)) /
					1000000.0);
			else
				fprintf(fout, "avg credit:%d\n",
					(int)(qstats.sum_credit /
					      (1500 *
					       (long long)qstats.pkts_total +
					       1)));

			/* Return values stats */
			for (k = 0; k < RET_VAL_COUNT; k++) {
				percent_pkts =
					(qstats.returnValCount[k] * 100.0) /
					(qstats.pkts_total + 1);
				fprintf(fout, "%s:%6.2f (%d)\n",
					returnValNames[k], percent_pkts,
					(int)qstats.returnValCount[k]);
			}
			fclose(fout);
		}
	}

	if (debugFlag)
		read_trace_pipe2();
	return rc;

err:
	rc = 1;
	if (cg1 > 0)
		close(cg1);
	cleanup_cgroup_environment();
	return rc;
}
/*
 * Print the command line help text on stdout.
 *
 * The default quoted for -t matches the initial value of "dur" (1 second).
 */
static void Usage(void)
{
	printf("This program loads a cgroup skb BPF program to enforce\n"
	       "cgroup output (egress) bandwidth limits.\n\n"
	       "USAGE: hbm [-o] [-d] [-l] [-n <id>] [--no_cn] [-r <rate>]\n"
	       " [-s] [-t <secs>] [-w] [-h] [prog]\n"
	       " Where:\n"
	       " -o indicates egress direction (default)\n"
	       " -d print BPF trace debug buffer\n"
	       " --edt use fq's Earliest Departure Time\n"
	       " -l also limit flows using loopback\n"
	       " -n <#> to create cgroup \"/hbm#\" and attach prog\n"
	       " Default is /hbm1\n"
	       " --no_cn disable CN notifications\n"
	       " -r <rate> Rate in Mbps\n"
	       " -s Update HBM stats\n"
	       " -t <time> Exit after specified seconds (default is 1)\n"
	       " -w Work conserving flag. cgroup can increase\n"
	       " bandwidth beyond the rate limit specified\n"
	       " while there is available bandwidth. Current\n"
	       " implementation assumes there is only eth0\n"
	       " but can be extended to support multiple NICs\n"
	       " -h print this info\n"
	       " prog BPF program file name. Name defaults to\n"
	       " hbm_out_kern.o\n");
}
/*
 * Parse the command line, then load and run the BPF program.
 *
 * Short options are described by Usage(); the long options --no_cn and
 * --edt are reported by getopt_long() as the values 1 and 2. An optional
 * positional argument overrides the BPF object file name.
 *
 * Returns 0 after -h, 1 for an invalid command line, and otherwise the
 * result of run_bpf_prog().
 */
int main(int argc, char **argv)
{
	char *prog = "hbm_out_kern.o";
	char *optstring = "iodln:r:st:wh";
	struct option loptions[] = {
		{"no_cn", 0, NULL, 1},
		{"edt", 0, NULL, 2},
		{NULL, 0, NULL, 0}
	};
	int cg_id = 1;
	int k;

	while ((k = getopt_long(argc, argv, optstring, loptions, NULL)) != -1) {
		switch (k) {
		case 1:
			no_cn_flag = true;
			break;
		case 2:
			prog = "hbm_edt_kern.o";
			edt_flag = true;
			break;
		case 'o':
			/* Egress is already the default. */
			break;
		case 'd':
			debugFlag = true;
			break;
		case 'l':
			loopback_flag = true;
			break;
		case 'n':
			cg_id = atoi(optarg);
			break;
		case 'r':
			minRate = atoi(optarg) * 1.024;
			/* A rate of zero (or unparsable input) would later
			 * be used as a divisor in the work-conserving loop.
			 */
			if (minRate <= 0) {
				fprintf(stderr,
					"Invalid rate: %s\n\n", optarg);
				Usage();
				return 1;
			}
			rate = minRate;
			break;
		case 's':
			stats_flag = true;
			break;
		case 't':
			dur = atoi(optarg);
			/* sleep() takes an unsigned value; a negative
			 * duration would turn into a very long sleep.
			 */
			if (dur < 0) {
				fprintf(stderr,
					"Invalid duration: %s\n\n", optarg);
				Usage();
				return 1;
			}
			break;
		case 'w':
			work_conserving_flag = true;
			break;
		case 'h':
			Usage();
			return 0;
		case '?':
			if (optopt == 'n' || optopt == 'r' || optopt == 't')
				fprintf(stderr,
					"Option -%c requires an argument.\n\n",
					optopt);
			/* fallthrough */
		default:
			/* Also reached for -i, which is accepted by
			 * optstring but has no handler.
			 */
			Usage();
			return 1;
		}
	}

	if (optind < argc)
		prog = argv[optind];
	printf("HBM prog: %s\n", prog != NULL ? prog : "NULL");

	return run_bpf_prog(prog, cg_id);
}