#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/protosw.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#if INET6
#include <netinet/ip6.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_cc.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_seq.h>
#include <kern/task.h>
#include <libkern/OSAtomic.h>
/* Forward declarations of the CUBIC congestion-control callbacks */
static int tcp_cubic_init(struct tcpcb *tp);
static int tcp_cubic_cleanup(struct tcpcb *tp);
static void tcp_cubic_cwnd_init_or_reset(struct tcpcb *tp);
static void tcp_cubic_congestion_avd(struct tcpcb *tp, struct tcphdr *th);
static void tcp_cubic_ack_rcvd(struct tcpcb *tp, struct tcphdr *th);
static void tcp_cubic_pre_fr(struct tcpcb *tp);
static void tcp_cubic_post_fr(struct tcpcb *tp, struct tcphdr *th);
static void tcp_cubic_after_timeout(struct tcpcb *tp);
static int tcp_cubic_delay_ack(struct tcpcb *tp, struct tcphdr *th);
static void tcp_cubic_switch_cc(struct tcpcb *tp, u_int16_t old_index);
static uint32_t tcp_cubic_update(struct tcpcb *tp, u_int32_t rtt);
static uint32_t tcp_cubic_tcpwin(struct tcpcb *tp, struct tcphdr *th);
static inline void tcp_cubic_clear_state(struct tcpcb *tp);
/* CUBIC needs a cube root; the kernel exports a float cbrtf() for this */
extern float cbrtf(float x);
/*
 * Callback table registering CUBIC with the TCP congestion-control
 * framework.  after_idle deliberately reuses the cwnd reset routine so
 * a connection restarts probing after an idle period.
 */
struct tcp_cc_algo tcp_cc_cubic = {
	.name = "cubic",
	.init = tcp_cubic_init,
	.cleanup = tcp_cubic_cleanup,
	.cwnd_init = tcp_cubic_cwnd_init_or_reset,
	.congestion_avd = tcp_cubic_congestion_avd,
	.ack_rcvd = tcp_cubic_ack_rcvd,
	.pre_fr = tcp_cubic_pre_fr,
	.post_fr = tcp_cubic_post_fr,
	.after_idle = tcp_cubic_cwnd_init_or_reset,
	.after_timeout = tcp_cubic_after_timeout,
	.delay_ack = tcp_cubic_delay_ack,
	.switch_to = tcp_cubic_switch_cc
};
/* Multiplicative decrease factor: window is backed off by 20% on loss */
const float tcp_cubic_backoff = 0.2f;
/* Cubic coefficient (C) used in the window growth function */
const float tcp_cubic_coeff = 0.4f;
/* Scale-down factor (7/8) applied to last_max under fast convergence */
const float tcp_cubic_fast_convergence_factor = 0.875f;
/* Tunable: force the TCP-friendly region check on (default off) */
SYSCTL_SKMEM_TCP_INT(OID_AUTO, cubic_tcp_friendliness, CTLFLAG_RW | CTLFLAG_LOCKED,
    static int, tcp_cubic_tcp_friendliness, 0, "Enable TCP friendliness");
/* Tunable: enable fast-convergence backoff of last_max (default off) */
SYSCTL_SKMEM_TCP_INT(OID_AUTO, cubic_fast_convergence, CTLFLAG_RW | CTLFLAG_LOCKED,
    static int, tcp_cubic_fast_convergence, 0, "Enable fast convergence");
/* Tunable: floor value used in place of small RTTs (default off) */
SYSCTL_SKMEM_TCP_INT(OID_AUTO, cubic_use_minrtt, CTLFLAG_RW | CTLFLAG_LOCKED,
    static int, tcp_cubic_use_minrtt, 0, "use a min of 5 sec rtt");
/*
 * Attach CUBIC to a connection: account for the socket under this
 * algorithm and reset all per-connection CUBIC variables.
 * Returns 0 on success.
 */
static int
tcp_cubic_init(struct tcpcb *tp)
{
	OSIncrementAtomic((volatile SInt32 *)&tcp_cc_cubic.num_sockets);

	/* The CC state block must already be allocated by the caller */
	VERIFY(tp->t_ccstate != NULL);
	tcp_cubic_clear_state(tp);
	return 0;
}
/*
 * Detach CUBIC from a connection: drop the per-algorithm socket count.
 * The CC state block itself is freed by the framework, not here.
 * Returns 0 on success.
 */
static int
tcp_cubic_cleanup(struct tcpcb *tp)
{
#pragma unused(tp)
	OSDecrementAtomic((volatile SInt32 *)&tcp_cc_cubic.num_sockets);
	return 0;
}
/*
 * Initialize the congestion window at the start of a connection or
 * after an idle period.  Also used when switching to CUBIC mid-stream.
 */
static void
tcp_cubic_cwnd_init_or_reset(struct tcpcb *tp)
{
	VERIFY(tp->t_ccstate != NULL);

	tcp_cubic_clear_state(tp);
	tcp_cc_cwnd_init_or_reset(tp);
	tp->t_pipeack = 0;
	tcp_clear_pipeack_state(tp);

	/* Start counting bytes for RFC 3465 again */
	tp->t_bytes_acked = 0;

	/*
	 * slow start threshold could get initialized to a lower value
	 * when there is a cached value in the route metrics.  In this
	 * case, the connection can enter congestion avoidance without any
	 * packet loss and Cubic will enter steady-state too early.  It is
	 * better to always probe to find the initial slow-start threshold.
	 */
	if (tp->t_inpcb->inp_stat->txbytes <= TCP_CC_CWND_INIT_BYTES
	    && tp->snd_ssthresh < (TCP_MAXWIN << TCP_MAX_WINSHIFT)) {
		tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	}

	/* Initialize cubic last max to be same as ssthresh */
	tp->t_ccstate->cub_last_max = tp->snd_ssthresh;
}
/*
 * Compute the CUBIC target congestion window for the next RTT, using
 * the cube of the time elapsed since the start of the current epoch
 * (scaled by tcp_cubic_coeff) around the origin point.  Returns the
 * target window in bytes and caches it in cub_target_win.
 */
static uint32_t
tcp_cubic_update(struct tcpcb *tp, u_int32_t rtt)
{
	float K, var;
	u_int32_t elapsed_time, win;

	win = min(tp->snd_cwnd, tp->snd_wnd);
	if (tp->t_ccstate->cub_last_max == 0) {
		tp->t_ccstate->cub_last_max = tp->snd_ssthresh;
	}

	if (tp->t_ccstate->cub_epoch_start == 0) {
		/*
		 * This is the beginning of a new epoch, initialize some of
		 * the variables that we need to use for computing the
		 * congestion window later.
		 */
		tp->t_ccstate->cub_epoch_start = tcp_now;
		/* 0 means "no epoch"; avoid that value of tcp_now */
		if (tp->t_ccstate->cub_epoch_start == 0) {
			tp->t_ccstate->cub_epoch_start = 1;
		}
		if (win < tp->t_ccstate->cub_last_max) {
			/* Float arithmetic is only safe in the kernel task */
			VERIFY(current_task() == kernel_task);

			/*
			 * Compute the cubic epoch period: the time by which
			 * the window will regain the previous maximum,
			 * K = cbrt((last_max - win) / (maxseg * C)).
			 */
			K = (tp->t_ccstate->cub_last_max - win)
			    / tp->t_maxseg / tcp_cubic_coeff;
			K = cbrtf(K);
			tp->t_ccstate->cub_epoch_period = K * TCP_RETRANSHZ;
			/* Origin point of the cubic curve */
			tp->t_ccstate->cub_origin_point =
			    tp->t_ccstate->cub_last_max;
		} else {
			/* Already above last max: grow from the current win */
			tp->t_ccstate->cub_epoch_period = 0;
			tp->t_ccstate->cub_origin_point = win;
		}
		tp->t_ccstate->cub_target_win = 0;
	}

	VERIFY(tp->t_ccstate->cub_origin_point > 0);
	/*
	 * Compute the target window for the next RTT using the given RTT
	 * (optionally floored by the cubic_use_minrtt tunable) added to the
	 * time elapsed in this epoch.
	 */
	elapsed_time = timer_diff(tcp_now, 0,
	    tp->t_ccstate->cub_epoch_start, 0);

	if (tcp_cubic_use_minrtt) {
		elapsed_time += max(tcp_cubic_use_minrtt, rtt);
	} else {
		elapsed_time += rtt;
	}
	/* W(t) = origin + C * maxseg * (t - K)^3 */
	var = (elapsed_time - tp->t_ccstate->cub_epoch_period) / TCP_RETRANSHZ;
	var = var * var * var * (tcp_cubic_coeff * tp->t_maxseg);

	tp->t_ccstate->cub_target_win = (u_int32_t)(tp->t_ccstate->cub_origin_point + var);
	return tp->t_ccstate->cub_target_win;
}
/*
 * Decide whether the connection should also track a TCP-style (AIMD)
 * window: enabled for non-realtime sockets whose mean deviation of the
 * window at loss exceeds two segments.  Evaluates to 1 or 0.
 *
 * Fix: the second operand previously read `tp->t_maxseg`, capturing
 * whatever variable named `tp` exists at the call site instead of the
 * macro parameter `_tp_` (unhygienic).  It only worked because the sole
 * caller passes a local named `tp`; use the parameter consistently.
 */
#define TCP_CUBIC_ENABLE_TCPMODE(_tp_) \
    ((!soissrcrealtime((_tp_)->t_inpcb->inp_socket) && \
    (_tp_)->t_ccstate->cub_mean_dev > ((_tp_)->t_maxseg << 1)) ? 1 : 0)
/*
 * Track the window a standard TCP (AIMD) flow would have: grow by one
 * segment for every window's worth of bytes acknowledged.  Used to keep
 * CUBIC at least as aggressive as TCP in the friendly region.
 * Returns the estimated TCP window in bytes.
 */
static uint32_t
tcp_cubic_tcpwin(struct tcpcb *tp, struct tcphdr *th)
{
	uint32_t est_win = tp->t_ccstate->cub_tcp_win;

	if (est_win == 0) {
		/* First sample: seed with the current effective window */
		est_win = min(tp->snd_cwnd, tp->snd_wnd);
		tp->t_ccstate->cub_tcp_win = est_win;
		tp->t_ccstate->cub_tcp_bytes_acked = 0;
	} else {
		uint32_t acked;

		acked = tp->t_ccstate->cub_tcp_bytes_acked +
		    BYTES_ACKED(th, tp);
		if (acked >= est_win) {
			/* A full window was acknowledged: linear growth */
			acked -= est_win;
			est_win += tp->t_maxseg;
			tp->t_ccstate->cub_tcp_win = est_win;
		}
		tp->t_ccstate->cub_tcp_bytes_acked = acked;
	}
	return est_win;
}
/*
 * Congestion-avoidance processing for an incoming ACK: grow cwnd toward
 * the CUBIC target window, or follow the TCP (AIMD) estimate when that
 * is larger (TCP-friendly region).
 */
static void
tcp_cubic_congestion_avd(struct tcpcb *tp, struct tcphdr *th)
{
	u_int32_t cubic_target_win, tcp_win, rtt;

	/* Do not increase the congestion window in non-validated phase */
	if (tcp_cc_is_cwnd_nonvalidated(tp) != 0) {
		return;
	}

	tp->t_bytes_acked += BYTES_ACKED(th, tp);

	rtt = get_base_rtt(tp);
	/*
	 * First compute the cubic window.  If the cubic variables are not
	 * initialized (after coming out of recovery), this call will
	 * initialize them.
	 */
	cubic_target_win = tcp_cubic_update(tp, rtt);

	/* Compute the TCP window if a multiplicative decrease of 0.2 is used */
	tcp_win = tcp_cubic_tcpwin(tp, th);

	if (tp->snd_cwnd < tcp_win &&
	    (tcp_cubic_tcp_friendliness == 1 ||
	    TCP_CUBIC_ENABLE_TCPMODE(tp))) {
		/* This connection is in the TCP-friendly region */
		if (tp->t_bytes_acked >= tp->snd_cwnd) {
			tp->t_bytes_acked -= tp->snd_cwnd;
			tp->snd_cwnd = min(tcp_win, TCP_MAXWIN << tp->snd_scale);
		}
	} else {
		if (cubic_target_win > tp->snd_cwnd) {
			/*
			 * The target window is computed for the next RTT.
			 * To reach this value, cwnd will have to be updated
			 * one segment at a time.  Compute how many bytes
			 * need to be acknowledged before we can increase
			 * the cwnd by one segment.
			 */
			u_int64_t incr_win;
			incr_win = tp->snd_cwnd * tp->t_maxseg;
			incr_win /= (cubic_target_win - tp->snd_cwnd);
			if (incr_win > 0 &&
			    tp->t_bytes_acked >= incr_win) {
				tp->t_bytes_acked -= incr_win;
				tp->snd_cwnd =
				    min((tp->snd_cwnd + tp->t_maxseg),
				    TCP_MAXWIN << tp->snd_scale);
			}
		}
	}
}
/*
 * Handle a newly received ACK: defer to congestion avoidance once cwnd
 * has reached ssthresh, otherwise perform RFC 3465 (ABC) slow-start
 * growth, capped at the maximum scaled window.
 */
static void
tcp_cubic_ack_rcvd(struct tcpcb *tp, struct tcphdr *th)
{
	uint32_t bytes, limit, grown;

	/* No window growth while cwnd is in the non-validated phase */
	if (tcp_cc_is_cwnd_nonvalidated(tp) != 0) {
		return;
	}

	if (tp->snd_cwnd >= tp->snd_ssthresh) {
		/* Congestion avoidance */
		tcp_cubic_congestion_avd(tp, th);
		return;
	}

	/*
	 * Slow start: grow by the bytes acknowledged, limited to one
	 * segment — or two when RFC 3465 lim2 applies and the sender is
	 * not retransmitting (snd_nxt == snd_max).
	 */
	bytes = BYTES_ACKED(th, tp);
	limit = tp->t_maxseg;
	if (tcp_do_rfc3465_lim2 && tp->snd_nxt == tp->snd_max) {
		limit = 2 * tp->t_maxseg;
	}
	grown = tp->snd_cwnd + min(bytes, limit);
	tp->snd_cwnd = min(grown, TCP_MAXWIN << tp->snd_scale);
}
/*
 * Called on entering fast recovery (packet loss detected): remember the
 * window at which loss occurred, update its running average/deviation,
 * and back off ssthresh by the CUBIC decrease factor.
 */
static void
tcp_cubic_pre_fr(struct tcpcb *tp)
{
	u_int32_t win, avg;
	int32_t dev;

	/* Loss ends the current cubic epoch; reset epoch/TCP-mode state */
	tp->t_ccstate->cub_epoch_start = 0;
	tp->t_ccstate->cub_tcp_win = 0;
	tp->t_ccstate->cub_target_win = 0;
	tp->t_ccstate->cub_tcp_bytes_acked = 0;

	win = min(tp->snd_cwnd, tp->snd_wnd);
	if (tp->t_flagsext & TF_CWND_NONVALIDATED) {
		/*
		 * cwnd was not validated: use half of the larger of pipeack
		 * and the flight size at loss as a better estimate of the
		 * usable window.
		 */
		tp->t_lossflightsize = tp->snd_max - tp->snd_una;
		win = (max(tp->t_pipeack, tp->t_lossflightsize)) >> 1;
	} else {
		tp->t_lossflightsize = 0;
	}

	/*
	 * Record the window at which loss occurred.  If it is below the
	 * previous maximum and fast convergence is enabled, back off a
	 * little more to release bandwidth to newer flows quickly.
	 */
	if (win < tp->t_ccstate->cub_last_max &&
	    tcp_cubic_fast_convergence == 1) {
		tp->t_ccstate->cub_last_max = (u_int32_t)(win *
		    tcp_cubic_fast_convergence_factor);
	} else {
		tp->t_ccstate->cub_last_max = win;
	}

	if (tp->t_ccstate->cub_last_max == 0) {
		/* Fall back to the outstanding data as last max */
		tp->t_ccstate->cub_last_max = tp->snd_max - tp->snd_una;
	}

	/*
	 * Maintain an exponentially weighted average (weight 1/64) of the
	 * window at loss.
	 */
	if (tp->t_ccstate->cub_avg_lastmax == 0) {
		tp->t_ccstate->cub_avg_lastmax = tp->t_ccstate->cub_last_max;
	} else {
		/* avg = (63*avg + last_max) / 64, in fixed point */
		avg = tp->t_ccstate->cub_avg_lastmax;
		avg = (avg << 6) - avg;
		tp->t_ccstate->cub_avg_lastmax =
		    (avg + tp->t_ccstate->cub_last_max) >> 6;
	}

	/* Mean absolute deviation of last_max, smoothed with weight 1/16 */
	dev = tp->t_ccstate->cub_avg_lastmax - tp->t_ccstate->cub_last_max;
	if (dev < 0) {
		dev = -dev;
	}

	if (tp->t_ccstate->cub_mean_dev == 0) {
		tp->t_ccstate->cub_mean_dev = dev;
	} else {
		dev = dev + ((tp->t_ccstate->cub_mean_dev << 4)
		    - tp->t_ccstate->cub_mean_dev);
		tp->t_ccstate->cub_mean_dev = dev >> 4;
	}

	/* Back off by tcp_cubic_backoff, keeping at least two segments */
	win = (u_int32_t)(win - (win * tcp_cubic_backoff));
	win = (win / tp->t_maxseg);
	if (win < 2) {
		win = 2;
	}
	tp->snd_ssthresh = win * tp->t_maxseg;
	tcp_cc_resize_sndbuf(tp);
}
/*
 * Called on leaving fast recovery: set cwnd based on the data still in
 * flight (or ssthresh, whichever applies) so the sender does not burst.
 */
static void
tcp_cubic_post_fr(struct tcpcb *tp, struct tcphdr *th)
{
	uint32_t flight_size = 0;

	if (SEQ_LEQ(th->th_ack, tp->snd_max)) {
		flight_size = tp->snd_max - th->th_ack;
	}

	if (SACK_ENABLED(tp) && tp->t_lossflightsize > 0) {
		u_int32_t total_rxt_size = 0, ncwnd;
		/*
		 * When SACK is enabled, the number of retransmitted bytes
		 * can be counted more accurately.
		 */
		total_rxt_size = tcp_rxtseg_total_size(tp);
		ncwnd = max(tp->t_pipeack, tp->t_lossflightsize);
		if (total_rxt_size <= ncwnd) {
			ncwnd = ncwnd - total_rxt_size;
		}

		/*
		 * To avoid sending a large burst at the end of recovery,
		 * set a max limit on ncwnd (64 segments), then halve it.
		 */
		ncwnd = min(ncwnd, (tp->t_maxseg << 6));
		ncwnd = ncwnd >> 1;
		flight_size = max(ncwnd, flight_size);
	}

	if (flight_size < tp->snd_ssthresh) {
		/* Restart from flight size plus one segment of headroom */
		tp->snd_cwnd = max(flight_size, tp->t_maxseg)
		    + tp->t_maxseg;
	} else {
		tp->snd_cwnd = tp->snd_ssthresh;
	}

	/* Recovery ended: restart TCP-mode and target-window tracking */
	tp->t_ccstate->cub_tcp_win = 0;
	tp->t_ccstate->cub_target_win = 0;
	tp->t_ccstate->cub_tcp_bytes_acked = 0;
}
/*
 * Handle a retransmission timeout: treat it as severe congestion by
 * closing the congestion window down to one segment.
 */
static void
tcp_cubic_after_timeout(struct tcpcb *tp)
{
	VERIFY(tp->t_ccstate != NULL);

	/*
	 * Avoid adjusting the congestion window due to SYN
	 * retransmissions (pre-established state with at most one byte
	 * outstanding).  If more than one byte is outstanding, the window
	 * still needs to be adjusted.
	 */
	if (tp->t_state < TCPS_ESTABLISHED &&
	    ((int)(tp->snd_max - tp->snd_una) <= 1)) {
		return;
	}

	/* If not already in recovery, record the loss state first */
	if (!IN_FASTRECOVERY(tp)) {
		tcp_cubic_clear_state(tp);
		tcp_cubic_pre_fr(tp);
	}

	tp->snd_cwnd = tp->t_maxseg;
}
/*
 * Decide whether an ACK for this segment may be delayed; CUBIC simply
 * defers to the common congestion-control helper.
 */
static int
tcp_cubic_delay_ack(struct tcpcb *tp, struct tcphdr *th)
{
	return tcp_cc_delay_ack(tp, th);
}
/*
 * Called when an established connection switches to CUBIC from another
 * congestion-control algorithm: reinitialize window state and account
 * for the socket under this algorithm.
 */
static void
tcp_cubic_switch_cc(struct tcpcb *tp, uint16_t old_cc_index)
{
#pragma unused(old_cc_index)
	tcp_cubic_cwnd_init_or_reset(tp);

	OSIncrementAtomic((volatile SInt32 *)&tcp_cc_cubic.num_sockets);
}
/*
 * Reset all per-connection CUBIC variables to their initial (zero)
 * values; the next ACK in congestion avoidance starts a fresh epoch.
 * Note: cub_avg_lastmax and cub_mean_dev are intentionally preserved
 * across resets so loss history survives recovery.
 */
static inline void
tcp_cubic_clear_state(struct tcpcb *tp)
{
	tp->t_ccstate->cub_last_max = 0;
	tp->t_ccstate->cub_epoch_start = 0;
	tp->t_ccstate->cub_origin_point = 0;
	tp->t_ccstate->cub_tcp_win = 0;
	tp->t_ccstate->cub_tcp_bytes_acked = 0;
	tp->t_ccstate->cub_epoch_period = 0;
	tp->t_ccstate->cub_target_win = 0;
}