#define _IP_VHL
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <net/route.h>
#include <net/ntstat.h>
#include <net/if_var.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <mach/sdt.h>
#if INET6
#include <netinet6/in6_pcb.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif
#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_cc.h>
#if TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
#include <sys/kdebug.h>
#include <mach/sdt.h>
#if IPSEC
#include <netinet6/ipsec.h>
#endif
#if CONFIG_MACF_NET
#include <security/mac_framework.h>
#endif
#define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 1)
#define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 3)
#define DBG_FNC_TCP_OUTPUT NETDBG_CODE(DBG_NETTCP, (4 << 8) | 1)
#ifdef notyet
extern struct mbuf *m_copypack();
#endif
/* Sysctl-tunable knobs for the TCP output path. */
int path_mtu_discovery = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW | CTLFLAG_LOCKED,
&path_mtu_discovery, 1, "Enable Path MTU Discovery");
/* Initial slow-start flight sizes (segments) for remote and local peers. */
int ss_fltsz = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW | CTLFLAG_LOCKED,
&ss_fltsz, 1, "Slow start flight size");
int ss_fltsz_local = 8;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW | CTLFLAG_LOCKED,
&ss_fltsz_local, 1, "Slow start flight size for local networks");
int tcp_do_tso = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW | CTLFLAG_LOCKED,
&tcp_do_tso, 0, "Enable TCP Segmentation Offload");
/* ECN negotiation policy: initiate on outbound SYNs / accept on inbound SYNs. */
int tcp_ecn_outbound = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_initiate_out, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_outbound,
0, "Initiate ECN for outbound connections");
int tcp_ecn_inbound = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_negotiate_in, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_inbound,
0, "Allow ECN negotiation for inbound connections");
/* Max number of segments queued on tp->t_pktlist before being pushed to IP. */
int tcp_packet_chaining = 50;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, packetchain, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_packet_chaining,
0, "Enable TCP output packet chaining");
/* When set, tcp_ip_output() may drop the socket lock while calling into IP. */
int tcp_output_unlocked = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, socket_unlocked_on_output, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_output_unlocked,
0, "Unlock TCP when sending packets down to IP");
int tcp_do_rfc3390 = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW | CTLFLAG_LOCKED,
&tcp_do_rfc3390, 1, "Calculate intial slowstart cwnd depending on MSS");
/* Inter-arrival-jitter based receive window tuning (TRAFFIC_MGT). */
int tcp_min_iaj_win = MIN_IAJ_WIN;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, min_iaj_win, CTLFLAG_RW | CTLFLAG_LOCKED,
&tcp_min_iaj_win, 1, "Minimum recv win based on inter-packet arrival jitter");
int tcp_acc_iaj_react_limit = ACC_IAJ_REACT_LIMIT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, acc_iaj_react_limit, CTLFLAG_RW | CTLFLAG_LOCKED,
&tcp_acc_iaj_react_limit, 1, "Accumulated IAJ when receiver starts to react");
/* Packet-chaining statistics (debug counters, not exported via sysctl). */
static int32_t packchain_newlist = 0;
static int32_t packchain_looped = 0;
static int32_t packchain_sent = 0;
#if IPSEC
extern int ipsec_bypass;
#endif
extern int slowlink_wsize;
#if IPFIREWALL
extern int fw_enable;
extern int fw_bypass;
#endif
extern vm_size_t so_cache_zone_element_size;
#if RANDOM_IP_ID
extern int ip_use_randomid;
#endif
extern u_int32_t dlil_filter_count;
extern u_int32_t kipf_count;
extern int tcp_recv_bg;
/* Forward declarations for file-local helpers. */
static int tcp_ip_output(struct socket *, struct tcpcb *, struct mbuf *, int,
struct mbuf *, int, int, int32_t);
static inline int is_tcp_recv_bg(struct socket *so);
/*
 * Derive a 16-bit identifier for a socket from its slot position within
 * the socket cache zone.  Returns 0 only while the zone element size is
 * still unknown; otherwise a computed id of 0 is remapped to 0xffff so
 * that 0 can be treated as "no id".
 */
static __inline__ u_int16_t
get_socket_id(struct socket * s)
{
u_int16_t id = 0;
if (so_cache_zone_element_size != 0) {
id = (u_int16_t)(((uintptr_t)s) / so_cache_zone_element_size);
if (id == 0)
id = 0xffff;
}
return (id);
}
static inline int
is_tcp_recv_bg(struct socket *so)
{
return (so->so_traffic_mgt_flags & TRAFFIC_MGT_TCP_RECVBG);
}
/*
 * tcp_output -- TCP output machine.
 *
 * Decides whether a segment should be sent for connection tp, builds the
 * segment (headers + options + data copied from the socket send buffer),
 * and hands it to IP -- either directly via ip6_output() for IPv6, or by
 * appending it to tp->t_pktlist and flushing the chain through
 * tcp_ip_output() for IPv4.  Loops (via "again") while more segments can
 * be emitted.  Returns 0 or an errno from the IP layer.
 *
 * Caller must hold the socket lock for tp's socket.
 */
int
tcp_output(struct tcpcb *tp)
{
struct socket *so = tp->t_inpcb->inp_socket;
int32_t len, recwin, sendwin, off;
int flags, error;
register struct mbuf *m;
struct ip *ip = NULL;
register struct ipovly *ipov = NULL;
#if INET6
struct ip6_hdr *ip6 = NULL;
#endif
register struct tcphdr *th;
u_char opt[TCP_MAXOLEN];
unsigned ipoptlen, optlen, hdrlen;
int idle, sendalot, lost = 0;
int i, sack_rxmit;
int tso = 0;
int sack_bytes_rxmt;
struct sackhole *p;
#ifdef IPSEC
unsigned ipsec_optlen = 0;
#endif
int last_off = 0;
int m_off;
int idle_time = 0;
struct mbuf *m_lastm = NULL;
struct mbuf *m_head = NULL;
struct mbuf *packetlist = NULL;
struct mbuf *tp_inp_options = tp->t_inpcb->inp_depend4.inp4_options;
#if INET6
int isipv6 = tp->t_inpcb->inp_vflag & INP_IPV6 ;
struct ip6_pktopts *inp6_pktopts = tp->t_inpcb->inp_depend6.inp6_outputopts;
#endif
short packchain_listadd = 0;
u_int16_t socket_id = get_socket_id(so);
int so_options = so->so_options;
struct rtentry *rt;
/*
 * Determine length of data that should be transmitted, and flags that
 * will be used.  If there is some data or critical controls (SYN, RST)
 * to send, then transmit; otherwise, investigate further.
 *
 * If we have been idle for at least one RTO, let the congestion-control
 * module reset/re-validate the congestion window.
 */
idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
idle_time = tcp_now - tp->t_rcvtime;
if (idle && idle_time >= tp->t_rxtcur) {
if (CC_ALGO(tp)->after_idle != NULL)
CC_ALGO(tp)->after_idle(tp);
DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb,
struct tcpcb *, tp, struct tcphdr *, NULL,
int32_t, TCP_CC_IDLE_TIMEOUT);
}
tp->t_flags &= ~TF_LASTIDLE;
if (idle) {
if (tp->t_flags & TF_MORETOCOME) {
/* Caller promised more data; treat next call as still idle. */
tp->t_flags |= TF_LASTIDLE;
idle = 0;
}
}
again:
KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0);
#if INET6
if (isipv6) {
KERNEL_DEBUG(DBG_LAYER_BEG,
((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
(((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
(tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)),
sendalot,0,0);
}
else
#endif
{
KERNEL_DEBUG(DBG_LAYER_BEG,
((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
(((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) |
(tp->t_inpcb->inp_faddr.s_addr & 0xffff)),
sendalot,0,0);
/*
 * If the cached route is down or stale (generation mismatch),
 * re-validate it: confirm the local address still exists, refresh
 * multipage/TSO capability from the (possibly new) interface, and
 * recompute whether PMTU discovery applies.
 */
rt = tp->t_inpcb->inp_route.ro_rt;
if (rt != NULL && (!(rt->rt_flags & RTF_UP) ||
rt->generation_id != route_generation)) {
struct ifnet *ifp;
struct in_ifaddr *ia;
/* disable multipages at the socket until the new ifp is known */
somultipages(so, FALSE);
tp->t_flags &= ~TF_TSO;
if ((ia = ifa_foraddr(tp->t_inpcb->inp_laddr.s_addr)) == NULL) {
/* Local address is gone: drop mature connections outright. */
if (tp->t_state >= TCPS_CLOSE_WAIT) {
tcp_drop(tp, EADDRNOTAVAIL);
return(EADDRNOTAVAIL);
}
/*
 * Address may come back; arm the retransmit timer so we
 * retry later, cancelling any persist timer in favor of it.
 */
if (!tp->t_timer[TCPT_REXMT]) {
tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
if (tp->t_timer[TCPT_PERSIST]) {
tp->t_timer[TCPT_PERSIST] = 0;
tp->t_rxtshift = 0;
tp->t_persist_stop = 0;
tp->rxt_start = 0;
}
}
/* Nothing can be sent now; discard any queued chain. */
if (tp->t_pktlist_head != NULL)
m_freem_list(tp->t_pktlist_head);
TCP_PKTLIST_CLEAR(tp);
if (so->so_flags & SOF_NOADDRAVAIL) {
tcp_drop(tp, EADDRNOTAVAIL);
return(EADDRNOTAVAIL);
}
else {
tcp_check_timer_state(tp);
return(0);
}
}
IFA_REMREF(&ia->ia_ifa);
RT_LOCK(rt);
if ((ifp = rt->rt_ifp) != NULL) {
somultipages(so, (ifp->if_hwassist & IFNET_MULTIPAGES));
tcp_set_tso(tp, ifp);
}
if (rt->rt_flags & RTF_UP)
rt->generation_id = route_generation;
/*
 * See if we should do MTU discovery. Don't do it if:
 *	1) it is disabled via the sysctl
 *	2) the route isn't up
 *	3) the MTU is locked (if it is, then discovery has been
 *	   disabled for that route)
 */
if (!path_mtu_discovery || ((rt != NULL) &&
(!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU))))
tp->t_flags &= ~TF_PMTUD;
else
tp->t_flags |= TF_PMTUD;
RT_UNLOCK(rt);
}
}
/*
 * If snd_nxt was rewound for retransmission, possibly move it forward
 * past already-SACKed data.
 */
if (tp->sack_enable && SEQ_LT(tp->snd_nxt, tp->snd_max))
tcp_sack_adjust(tp);
sendalot = 0;
off = tp->snd_nxt - tp->snd_una;
sendwin = min(tp->snd_wnd, tp->snd_cwnd);
if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0)
sendwin = min(sendwin, slowlink_wsize);
flags = tcp_outflags[tp->t_state];
/*
 * During fast recovery with SACK, prefer retransmitting data from the
 * scoreboard hole returned by tcp_sack_output() over sending new data.
 */
sack_rxmit = 0;
sack_bytes_rxmt = 0;
len = 0;
p = NULL;
if (tp->sack_enable && IN_FASTRECOVERY(tp) &&
(p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
int32_t cwin;
cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
if (cwin < 0)
cwin = 0;
/* Do not retransmit beyond snd_recover; clamp len accordingly. */
if (SEQ_GT(p->end, tp->snd_recover)) {
if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
p = NULL;
goto after_sack_rexmit;
} else
len = ((int32_t)min(cwin,
tp->snd_recover - p->rxmit));
} else
len = ((int32_t)min(cwin, p->end - p->rxmit));
if (len > 0) {
off = p->rxmit - tp->snd_una;
sack_rxmit = 1;
sendalot = 1;
tcpstat.tcps_sack_rexmits++;
tcpstat.tcps_sack_rexmit_bytes +=
min(len, tp->t_maxseg);
if (nstat_collect) {
nstat_route_tx(tp->t_inpcb->inp_route.ro_rt, 1, min(len, tp->t_maxseg), NSTAT_TX_FLAG_RETRANSMIT);
locked_add_64(&tp->t_inpcb->inp_stat->txpackets, 1);
locked_add_64(&tp->t_inpcb->inp_stat->txbytes, min(len, tp->t_maxseg));
tp->t_stat.txretransmitbytes += min(len, tp->t_maxseg);
}
}
else
len = 0;
}
after_sack_rexmit:
/* Implied-SYN/FIN transitions (T/TCP style NEEDSYN/NEEDFIN). */
if (tp->t_flags & TF_NEEDFIN)
flags |= TH_FIN;
if (tp->t_flags & TF_NEEDSYN)
flags |= TH_SYN;
/*
 * Forced (persist-probe) output: with a zero window send one byte;
 * with a nonzero window cancel persist mode and its backoff state.
 */
if (tp->t_force) {
if (sendwin == 0) {
if (off < so->so_snd.sb_cc)
flags &= ~TH_FIN;
sendwin = 1;
} else {
tp->t_timer[TCPT_PERSIST] = 0;
tp->t_rxtshift = 0;
tp->rxt_start = 0;
tp->t_persist_stop = 0;
}
}
/*
 * Compute how much new data may be sent, bounding by the send buffer
 * and the window (and, during SACK recovery, by a decoupled cwnd).
 */
if (sack_rxmit == 0) {
if (sack_bytes_rxmt == 0)
len = min(so->so_snd.sb_cc, sendwin) - off;
else {
int32_t cwin;
len = min(so->so_snd.sb_cc, tp->snd_wnd)
- off;
if (len > 0) {
cwin = tp->snd_cwnd -
(tp->snd_nxt - tp->sack_newdata) -
sack_bytes_rxmt;
if (cwin < 0)
cwin = 0;
len = imin(len, cwin);
}
else
len = 0;
}
}
/*
 * If the SYN was already sent/acknowledged, strip TH_SYN and adjust
 * the offset.  In SYN_SENT with data queued (TAO-style), flush any
 * pending packet chain and return without sending the data yet.
 */
if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
flags &= ~TH_SYN;
off--, len++;
if (len > 0 && tp->t_state == TCPS_SYN_SENT) {
while (!(tp->t_flags & TF_SENDINPROG) &&
tp->t_pktlist_head != NULL) {
packetlist = tp->t_pktlist_head;
packchain_listadd = tp->t_lastchain;
packchain_sent++;
TCP_PKTLIST_CLEAR(tp);
tp->t_flags |= TF_SENDINPROG;
error = tcp_ip_output(so, tp, packetlist,
packchain_listadd, tp_inp_options,
(so_options & SO_DONTROUTE), (sack_rxmit | (sack_bytes_rxmt != 0)), 0);
tp->t_flags &= ~TF_SENDINPROG;
}
/* tcp_close() may have been deferred while sending was in progress */
if ((tp->t_flags &
(TF_CLOSING|TF_SENDINPROG)) == TF_CLOSING) {
tp->t_flags &= ~TF_CLOSING;
(void) tcp_close(tp);
} else {
tcp_check_timer_state(tp);
}
KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,
0,0,0,0,0);
return 0;
}
}
/* With TF_NOOPT set, send only the bare SYN -- no data, no FIN. */
if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
len = 0;
flags &= ~TH_FIN;
}
/*
 * len went negative or zero: the window shrank or there's nothing to
 * send.  On a shrunken window, rewind snd_nxt and enter persist mode.
 */
if (len <= 0 && !(flags & TH_SYN)) {
len = 0;
if (sendwin == 0) {
tp->t_timer[TCPT_REXMT] = 0;
tp->t_rxtshift = 0;
tp->rxt_start = 0;
tp->snd_nxt = tp->snd_una;
if (tp->t_timer[TCPT_PERSIST] == 0)
tcp_setpersist(tp);
}
}
#if IPSEC
if (ipsec_bypass == 0)
ipsec_optlen = ipsec_hdrsiz_tcp(tp);
#endif
/*
 * More than one segment's worth of data: use TSO if the interface and
 * the connection allow it (no filters, no IP options, no SACK state,
 * no IPsec, no firewall); otherwise clamp to one MSS and loop.
 */
if (len > tp->t_maxseg) {
if ((tp->t_flags & TF_TSO) && tcp_do_tso &&
#if RANDOM_IP_ID
ip_use_randomid &&
#endif
kipf_count == 0 && dlil_filter_count == 0 &&
tp->rcv_numsacks == 0 && sack_rxmit == 0 && sack_bytes_rxmt == 0 &&
tp->t_inpcb->inp_options == NULL &&
tp->t_inpcb->in6p_options == NULL
#if IPSEC
&& ipsec_optlen == 0
#endif
#if IPFIREWALL
&& (fw_enable == 0 || fw_bypass)
#endif
) {
tso = 1;
sendalot = 0;
} else {
len = tp->t_maxseg;
sendalot = 1;
tso = 0;
}
}
/* Only set FIN if this segment reaches the end of the send buffer. */
if (sack_rxmit) {
if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
flags &= ~TH_FIN;
} else {
if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
flags &= ~TH_FIN;
}
recwin = tcp_sbspace(tp);
/*
 * Sender silly-window-avoidance: decide whether data justifies a
 * transmission now; each satisfied condition jumps to "send".
 */
if (len) {
if (len >= tp->t_maxseg) {
tp->t_flags |= TF_MAXSEGSNT;
goto send;
}
if (!(tp->t_flags & TF_MORETOCOME) &&
(idle || tp->t_flags & TF_NODELAY || tp->t_flags & TF_MAXSEGSNT) &&
(tp->t_flags & TF_NOPUSH) == 0 &&
len + off >= so->so_snd.sb_cc) {
tp->t_flags &= ~TF_MAXSEGSNT;
goto send;
}
if (tp->t_force) {
tp->t_flags &= ~TF_MAXSEGSNT;
goto send;
}
if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
tp->t_flags &= ~TF_MAXSEGSNT;
goto send;
}
if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
tp->t_flags &= ~TF_MAXSEGSNT;
goto send;
}
if (sack_rxmit)
goto send;
}
/*
 * Compare available window to amount of window known to peer (as
 * advertised window less next expected input).  Send a pure window
 * update if the gain is at least two segments or half the receive
 * buffer.
 */
if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN)) {
int32_t adv = imin(recwin, (int)TCP_MAXWIN << tp->rcv_scale) -
(tp->rcv_adv - tp->rcv_nxt);
if (adv >= (int32_t) (2 * tp->t_maxseg)) {
if ((tp->last_ack_sent != tp->rcv_nxt) || (((recwin + adv) >> tp->rcv_scale) > recwin))
goto send;
}
if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat)
goto send;
}
/* Remaining reasons to transmit: immediate ACK, RST/SYN, urgent, FIN. */
if (tp->t_flags & TF_ACKNOW)
goto send;
if ((flags & TH_RST) ||
((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
goto send;
if (SEQ_GT(tp->snd_up, tp->snd_una))
goto send;
if (flags & TH_FIN &&
((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
goto send;
/*
 * With SACK and outstanding unacked data but both timers idle,
 * re-arm the retransmit timer rather than sending.
 */
if (tp->sack_enable && (tp->t_state >= TCPS_ESTABLISHED) && SEQ_GT(tp->snd_max, tp->snd_una) &&
tp->t_timer[TCPT_REXMT] == 0 &&
tp->t_timer[TCPT_PERSIST] == 0) {
tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
goto just_return;
}
/*
 * Data to send but nothing sendable now and no timers running:
 * enter persist mode so progress eventually resumes.
 */
if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
tp->t_timer[TCPT_PERSIST] == 0) {
tp->t_rxtshift = 0;
tp->rxt_start = 0;
tcp_setpersist(tp);
}
just_return:
/*
 * Nothing new to send, but flush any previously chained packets
 * before returning (unless another thread is already sending).
 */
while (!(tp->t_flags & TF_SENDINPROG) && tp->t_pktlist_head != NULL) {
packetlist = tp->t_pktlist_head;
packchain_listadd = tp->t_lastchain;
packchain_sent++;
TCP_PKTLIST_CLEAR(tp);
tp->t_flags |= TF_SENDINPROG;
error = tcp_ip_output(so, tp, packetlist, packchain_listadd,
tp_inp_options, (so_options & SO_DONTROUTE), (sack_rxmit | (sack_bytes_rxmt != 0)), recwin);
tp->t_flags &= ~TF_SENDINPROG;
}
/* Perform a deferred close if one was requested while sending. */
if ((tp->t_flags & (TF_CLOSING|TF_SENDINPROG)) == TF_CLOSING) {
tp->t_flags &= ~TF_CLOSING;
(void) tcp_close(tp);
} else {
tcp_check_timer_state(tp);
}
KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
return (0);
send:
/*
 * Build TCP options into opt[]: MSS and window scale on SYN,
 * then ECN negotiation flags, timestamps, and SACK blocks.
 */
optlen = 0;
#if INET6
if (isipv6)
hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
else
#endif
hdrlen = sizeof (struct tcpiphdr);
if (flags & TH_SYN) {
tp->snd_nxt = tp->iss;
if ((tp->t_flags & TF_NOOPT) == 0) {
u_short mss;
opt[0] = TCPOPT_MAXSEG;
opt[1] = TCPOLEN_MAXSEG;
mss = htons((u_short) tcp_mssopt(tp));
(void)memcpy(opt + 2, &mss, sizeof(mss));
optlen = TCPOLEN_MAXSEG;
if ((tp->t_flags & TF_REQ_SCALE) &&
((flags & TH_ACK) == 0 ||
(tp->t_flags & TF_RCVD_SCALE))) {
*((u_int32_t *)(opt + optlen)) = htonl(
TCPOPT_NOP << 24 |
TCPOPT_WINDOW << 16 |
TCPOLEN_WINDOW << 8 |
tp->request_r_scale);
optlen += 4;
}
}
}
/*
 * ECN handshake (RFC 3168): echo ECE on SYN|ACK if the peer offered
 * ECN and we haven't answered; set ECE|CWR on an active SYN when
 * outbound ECN is enabled.
 */
if (tcp_ecn_inbound && (flags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
if ((tp->ecn_flags & TE_SETUPRECEIVED) != 0) {
if ((tp->ecn_flags & TE_SETUPSENT) == 0) {
flags |= TH_ECE;
tp->ecn_flags |= (TE_SETUPSENT | TE_SENDIPECT);
}
else {
tp->ecn_flags &= ~TE_SETUPRECEIVED;
}
}
}
else if (tcp_ecn_outbound && (flags & (TH_SYN | TH_ACK)) == TH_SYN) {
if ((tp->ecn_flags & TE_SETUPSENT) == 0) {
flags |= (TH_ECE | TH_CWR);
tp->ecn_flags |= (TE_SETUPSENT | TE_SENDIPECT);
}
else {
tp->ecn_flags &= ~TE_SENDIPECT;
}
}
/* Signal congestion-window reduction on the first new data segment. */
if ((tp->ecn_flags & TE_SENDCWR) != 0 && len != 0 &&
!SEQ_LT(tp->snd_nxt, tp->snd_max)) {
flags |= TH_CWR;
tp->ecn_flags &= ~TE_SENDCWR;
}
/* Echo congestion experienced on pure ACKs until CWR is seen. */
if ((tp->ecn_flags & TE_SENDECE) != 0 && len == 0) {
flags |= TH_ECE;
}
/* Timestamp option (RFC 1323), if negotiated and not a RST. */
if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
(flags & TH_RST) == 0 &&
((flags & TH_ACK) == 0 ||
(tp->t_flags & TF_RCVD_TSTMP))) {
u_int32_t *lp = (u_int32_t *)(opt + optlen);
*lp++ = htonl(TCPOPT_TSTAMP_HDR);
*lp++ = htonl(tcp_now);
*lp = htonl(tp->ts_recent);
optlen += TCPOLEN_TSTAMP_APPA;
}
if (tp->sack_enable && ((tp->t_flags & TF_NOOPT) == 0)) {
/* SACK-permitted on SYN; SACK blocks once established (RFC 2018). */
if ((flags & TH_SYN) &&
(!(flags & TH_ACK) || (tp->t_flags & TF_SACK_PERMIT))) {
u_char *bp;
bp = (u_char *)opt + optlen;
*bp++ = TCPOPT_SACK_PERMITTED;
*bp++ = TCPOLEN_SACK_PERMITTED;
optlen += TCPOLEN_SACK_PERMITTED;
}
if (TCPS_HAVEESTABLISHED(tp->t_state) &&
(tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0 &&
MAX_TCPOPTLEN - optlen - 2 >= TCPOLEN_SACK) {
int nsack, sackoptlen, padlen;
u_char *bp = (u_char *)opt + optlen;
u_int32_t *lp;
nsack = (MAX_TCPOPTLEN - optlen - 2) / TCPOLEN_SACK;
nsack = min(nsack, tp->rcv_numsacks);
sackoptlen = (2 + nsack * TCPOLEN_SACK);
/* NOP-pad so the SACK option ends on a 4-byte boundary. */
padlen = (MAX_TCPOPTLEN - optlen - sackoptlen) % 4;
optlen += padlen;
while (padlen-- > 0)
*bp++ = TCPOPT_NOP;
tcpstat.tcps_sack_send_blocks++;
*bp++ = TCPOPT_SACK;
*bp++ = sackoptlen;
lp = (u_int32_t *)bp;
for (i = 0; i < nsack; i++) {
struct sackblk sack = tp->sackblks[i];
*lp++ = htonl(sack.start);
*lp++ = htonl(sack.end);
}
optlen += sackoptlen;
}
}
/* Pad TCP options to a 4-byte boundary with EOL bytes. */
if (optlen < MAX_TCPOPTLEN && (optlen % sizeof(u_int32_t))) {
int pad = sizeof(u_int32_t) - (optlen % sizeof(u_int32_t));
u_char *bp = (u_char *)opt + optlen;
optlen += pad;
while (pad) {
*bp++ = TCPOPT_EOL;
pad--;
}
}
hdrlen += optlen;
/* Account for IP-level options (and IPsec headers) in the header size. */
#if INET6
if (isipv6)
ipoptlen = ip6_optlen(tp->t_inpcb);
else
#endif
{
if (tp_inp_options) {
ipoptlen = tp_inp_options->m_len -
offsetof(struct ipoption, ipopt_list);
} else
ipoptlen = 0;
}
#if IPSEC
ipoptlen += ipsec_optlen;
#endif
/*
 * Clamp len so headers + options + data fit in t_maxopd; for TSO,
 * clamp to the interface's TSO limit and an integral number of
 * segments instead.
 */
if (len + optlen + ipoptlen > tp->t_maxopd) {
flags &= ~TH_FIN;
if (tso) {
int32_t tso_maxlen;
tso_maxlen = tp->tso_max_segment_size ? tp->tso_max_segment_size : TCP_MAXWIN;
if (len > tso_maxlen - hdrlen - optlen) {
len = tso_maxlen - hdrlen - optlen;
len = len - (len % (tp->t_maxopd - optlen));
sendalot = 1;
} else if (tp->t_flags & TF_NEEDFIN)
sendalot = 1;
} else {
len = tp->t_maxopd - optlen - ipoptlen;
sendalot = 1;
}
}
#if INET6
if (max_linkhdr + hdrlen > MCLBYTES)
panic("tcphdr too big");
#else
if (max_linkhdr + hdrlen > MHLEN)
panic("tcphdr too big");
#endif
/*
 * Grab an mbuf (chain) for the headers and copy the data to be sent
 * out of the socket buffer behind them.  Update send statistics,
 * distinguishing retransmissions from new data.
 */
if (len) {
if (tp->t_force && len == 1)
tcpstat.tcps_sndprobe++;
else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
tcpstat.tcps_sndrexmitpack++;
tcpstat.tcps_sndrexmitbyte += len;
if (nstat_collect) {
nstat_route_tx(tp->t_inpcb->inp_route.ro_rt, 1, len, NSTAT_TX_FLAG_RETRANSMIT);
locked_add_64(&tp->t_inpcb->inp_stat->txpackets, 1);
locked_add_64(&tp->t_inpcb->inp_stat->txbytes, len);
tp->t_stat.txretransmitbytes += len;
}
} else {
tcpstat.tcps_sndpack++;
tcpstat.tcps_sndbyte += len;
if (nstat_collect) {
locked_add_64(&tp->t_inpcb->inp_stat->txpackets, 1);
locked_add_64(&tp->t_inpcb->inp_stat->txbytes, len);
}
}
#ifdef notyet
if ((m = m_copypack(so->so_snd.sb_mb, off,
(int)len, max_linkhdr + hdrlen)) == 0) {
error = ENOBUFS;
goto out;
}
m->m_len += hdrlen;
m->m_data -= hdrlen;
#else
m = NULL;
#if INET6
/* Headers won't fit in a plain mbuf header: use a cluster. */
if (MHLEN < hdrlen + max_linkhdr) {
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m == NULL) {
error = ENOBUFS;
goto out;
}
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_freem(m);
error = ENOBUFS;
goto out;
}
m->m_data += max_linkhdr;
m->m_len = hdrlen;
}
#endif
if (len <= MHLEN - hdrlen - max_linkhdr) {
/* Small payload: copy the data directly after the headers. */
if (m == NULL) {
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m == NULL) {
error = ENOBUFS;
goto out;
}
m->m_data += max_linkhdr;
m->m_len = hdrlen;
}
/* The send buffer may have been drained under us; bail quietly. */
if (so->so_snd.sb_mb == NULL || off < 0) {
if (m != NULL) m_freem(m);
error = 0;
goto out;
}
m_copydata(so->so_snd.sb_mb, off, (int) len,
mtod(m, caddr_t) + hdrlen);
m->m_len += len;
} else {
if (m != NULL) {
m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
if (m->m_next == 0) {
(void) m_free(m);
error = ENOBUFS;
goto out;
}
} else {
/*
 * Use the m_lastm/last_off cache to avoid rescanning the
 * send buffer; invalidate it when the buffer or offset
 * no longer matches.
 */
if (m_head != so->so_snd.sb_mb || sack_rxmit || last_off != off)
m_lastm = NULL;
last_off = off + len;
m_head = so->so_snd.sb_mb;
if (m_head == NULL) {
error = 0;
goto out;
}
if ((m = m_copym_with_hdrs(so->so_snd.sb_mb, off, len, M_DONTWAIT, &m_lastm, &m_off)) == NULL) {
error = ENOBUFS;
goto out;
}
m->m_data += max_linkhdr;
m->m_len = hdrlen;
}
}
#endif
/* PUSH when this segment empties the send buffer. */
if (off + len == so->so_snd.sb_cc)
flags |= TH_PUSH;
} else {
/* Data-less segment: ACK, control, urgent notify, or window update. */
if (tp->t_flags & TF_ACKNOW)
tcpstat.tcps_sndacks++;
else if (flags & (TH_SYN|TH_FIN|TH_RST))
tcpstat.tcps_sndctrl++;
else if (SEQ_GT(tp->snd_up, tp->snd_una))
tcpstat.tcps_sndurg++;
else
tcpstat.tcps_sndwinup++;
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m == NULL) {
error = ENOBUFS;
goto out;
}
#if INET6
if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
MHLEN >= hdrlen) {
MH_ALIGN(m, hdrlen);
} else
#endif
m->m_data += max_linkhdr;
m->m_len = hdrlen;
}
m->m_pkthdr.rcvif = 0;
#if CONFIG_MACF_NET
mac_mbuf_label_associate_inpcb(tp->t_inpcb, m);
#endif
/*
 * Fill in the IP/IPv6 and TCP header templates, marking the IP layer
 * ECT(0) on new data segments when ECN is in effect.
 */
#if INET6
if (isipv6) {
ip6 = mtod(m, struct ip6_hdr *);
th = (struct tcphdr *)(ip6 + 1);
tcp_fillheaders(tp, ip6, th);
if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len &&
!SEQ_LT(tp->snd_nxt, tp->snd_max)) {
ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
}
} else
#endif
{
ip = mtod(m, struct ip *);
ipov = (struct ipovly *)ip;
th = (struct tcphdr *)(ip + 1);
tcp_fillheaders(tp, ip, th);
if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len &&
!SEQ_LT(tp->snd_nxt, tp->snd_max)) {
ip->ip_tos = IPTOS_ECN_ECT0;
}
}
/* When retransmitting a FIN, rewind snd_nxt to include its sequence. */
if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
tp->snd_nxt == tp->snd_max)
tp->snd_nxt--;
/*
 * Sequence number selection: snd_nxt for data/control, snd_max for a
 * pure ACK (keeps persist-timer probes distinguishable), or the SACK
 * hole's rxmit pointer for SACK retransmissions.
 */
if (sack_rxmit == 0) {
if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST])
th->th_seq = htonl(tp->snd_nxt);
else
th->th_seq = htonl(tp->snd_max);
} else {
th->th_seq = htonl(p->rxmit);
p->rxmit += len;
tp->sackhint.sack_bytes_rexmit += len;
}
th->th_ack = htonl(tp->rcv_nxt);
tp->last_ack_sent = tp->rcv_nxt;
if (optlen) {
bcopy(opt, th + 1, optlen);
th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
}
th->th_flags = flags;
/*
 * Receiver silly-window avoidance: suppress windows smaller than one
 * segment or a quarter of the buffer, but never advertise less than
 * already promised; also honor slow-link and IAJ clamps.
 */
if (recwin < (int32_t)(so->so_rcv.sb_hiwat / 4) && recwin < (int)tp->t_maxseg)
recwin = 0;
if (recwin < (int32_t)(tp->rcv_adv - tp->rcv_nxt))
recwin = (int32_t)(tp->rcv_adv - tp->rcv_nxt);
if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0) {
if (recwin > (int32_t)slowlink_wsize)
recwin = slowlink_wsize;
}
#if TRAFFIC_MGT
if (tcp_recv_bg == 1 || is_tcp_recv_bg(so)) {
if (tp->acc_iaj > tcp_acc_iaj_react_limit) {
uint32_t min_iaj_win = tcp_min_iaj_win * tp->t_maxseg;
if (tp->iaj_rwintop == 0 ||
SEQ_LT(tp->iaj_rwintop, tp->rcv_adv))
tp->iaj_rwintop = tp->rcv_adv;
if (SEQ_LT(tp->iaj_rwintop, tp->rcv_nxt + min_iaj_win))
tp->iaj_rwintop = tp->rcv_nxt + min_iaj_win;
recwin = min(tp->iaj_rwintop - tp->rcv_nxt, recwin);
}
}
#endif
if (recwin > (int32_t)(TCP_MAXWIN << tp->rcv_scale))
recwin = (int32_t)(TCP_MAXWIN << tp->rcv_scale);
th->th_win = htons((u_short) (recwin>>tp->rcv_scale));
if (th->th_win == 0)
tp->t_flags |= TF_RXWIN0SENT;
else
tp->t_flags &= ~TF_RXWIN0SENT;
if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
th->th_flags |= TH_URG;
} else
tp->snd_up = tp->snd_una;
/*
 * Set packet length and checksum-offload metadata; for TSO, record
 * the per-segment size for the hardware.
 */
m->m_pkthdr.len = hdrlen + len;
#if INET6
if (isipv6) {
m->m_pkthdr.csum_flags = CSUM_TCPIPV6;
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
if (len + optlen)
th->th_sum = in_addword(th->th_sum,
htons((u_short)(optlen + len)));
}
else
#endif
{
m->m_pkthdr.csum_flags = CSUM_TCP;
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
if (len + optlen)
th->th_sum = in_addword(th->th_sum,
htons((u_short)(optlen + len)));
}
if (tso) {
#if INET6
if (isipv6)
m->m_pkthdr.csum_flags = CSUM_TSO_IPV6;
else
#endif
m->m_pkthdr.csum_flags = CSUM_TSO_IPV4;
m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen;
}
else
m->m_pkthdr.tso_segsz = 0;
/*
 * Advance snd_nxt past SYN/FIN/data, start RTT timing on new data,
 * and arm the retransmit timer for outstanding data -- except for
 * persist probes, which only advance snd_max.
 */
if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
tcp_seq startseq = tp->snd_nxt;
if (flags & (TH_SYN|TH_FIN)) {
if (flags & TH_SYN)
tp->snd_nxt++;
if (flags & TH_FIN) {
tp->snd_nxt++;
tp->t_flags |= TF_SENTFIN;
}
}
if (sack_rxmit)
goto timer;
tp->snd_nxt += len;
if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
tp->snd_max = tp->snd_nxt;
if (tp->t_rtttime == 0) {
tp->t_rtttime = tcp_now;
tp->t_rtseq = startseq;
tcpstat.tcps_segstimed++;
}
}
timer:
if (tp->t_timer[TCPT_REXMT] == 0 &&
((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
tp->snd_nxt != tp->snd_una)) {
if (tp->t_timer[TCPT_PERSIST]) {
tp->t_timer[TCPT_PERSIST] = 0;
tp->t_rxtshift = 0;
tp->rxt_start = 0;
tp->t_persist_stop = 0;
}
tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
}
} else {
int xlen = len;
if (flags & TH_SYN)
++xlen;
if (flags & TH_FIN) {
++xlen;
tp->t_flags |= TF_SENTFIN;
}
if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
tp->snd_max = tp->snd_nxt + len;
}
#if TCPDEBUG
if (so_options & SO_DEBUG)
tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
#endif
/*
 * Hand the segment to IP.  IPv6 goes straight to ip6_output();
 * IPv4 is appended to the pcb's packet chain and flushed through
 * tcp_ip_output() when the chain fills or chaining conditions end.
 */
#if INET6
if (isipv6) {
struct rtentry *rt6;
struct ip6_out_args ip6oa = { IFSCOPE_NONE, 0 };
unsigned int outif;
KERNEL_DEBUG(DBG_LAYER_BEG,
((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
(((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
(tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)),
0,0,0);
ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb,
tp->t_inpcb->in6p_route.ro_rt ?
tp->t_inpcb->in6p_route.ro_rt->rt_ifp
: NULL);
#if IPSEC
if (ipsec_bypass == 0 && ipsec_setsocket(m, so) != 0) {
m_freem(m);
error = ENOBUFS;
goto out;
}
#endif
m->m_pkthdr.socket_id = socket_id;
rt6 = tp->t_inpcb->in6p_route.ro_rt;
if (rt6 != NULL && rt6->rt_ifp != NULL
&& rt6->rt_ifp != lo_ifp)
set_packet_tclass(m, so, MBUF_TC_UNSPEC, 1);
DTRACE_TCP5(send, struct mbuf *, m, struct inpcb *, tp->t_inpcb, struct ip6_hdr *, ip6,
struct tcpcb *, tp, struct tcphdr *, th);
if (tp->t_inpcb->inp_flags & INP_BOUND_IF)
ip6oa.ip6oa_boundif = tp->t_inpcb->inp_boundif;
ip6oa.ip6oa_nocell = (tp->t_inpcb->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0;
error = ip6_output(m, inp6_pktopts, &tp->t_inpcb->in6p_route,
(so_options & SO_DONTROUTE) | IPV6_OUTARGS, NULL, NULL,
&ip6oa);
if ((rt6 = tp->t_inpcb->in6p_route.ro_rt) != NULL &&
(outif = rt6->rt_ifp->if_index) != tp->t_inpcb->in6p_last_outif)
tp->t_inpcb->in6p_last_outif = outif;
} else
#endif
{
ip->ip_len = m->m_pkthdr.len;
ip->ip_ttl = tp->t_inpcb->inp_ip_ttl;
ip->ip_tos |= (tp->t_inpcb->inp_ip_tos & ~IPTOS_ECN_MASK);
KERNEL_DEBUG(DBG_LAYER_BEG,
((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
(((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) |
(tp->t_inpcb->inp_faddr.s_addr & 0xffff)),
0,0,0);
if (path_mtu_discovery && (tp->t_flags & TF_PMTUD))
ip->ip_off |= IP_DF;
#if IPSEC
if (ipsec_bypass == 0)
ipsec_setsocket(m, so);
#endif
lost = 0;
m->m_pkthdr.socket_id = socket_id;
m->m_nextpkt = NULL;
if (tp->t_inpcb->inp_route.ro_rt != NULL &&
tp->t_inpcb->inp_route.ro_rt->rt_ifp != NULL &&
tp->t_inpcb->inp_route.ro_rt->rt_ifp != lo_ifp)
set_packet_tclass(m, so, MBUF_TC_UNSPEC, 0);
tp->t_pktlist_sentlen += len;
tp->t_lastchain++;
DTRACE_TCP5(send, struct mbuf *, m, struct inpcb *, tp->t_inpcb,
struct ip *, ip, struct tcpcb *, tp, struct tcphdr *, th);
if (tp->t_pktlist_head != NULL) {
tp->t_pktlist_tail->m_nextpkt = m;
tp->t_pktlist_tail = m;
} else {
packchain_newlist++;
tp->t_pktlist_head = tp->t_pktlist_tail = m;
}
/*
 * Flush the chain now when this is the last segment, the state or
 * urgency demands it, or the chain reached tcp_packet_chaining;
 * otherwise loop back to build the next segment onto the chain.
 */
if (sendalot == 0 || (tp->t_state != TCPS_ESTABLISHED) ||
(tp->snd_cwnd <= (tp->snd_wnd / 8)) ||
(tp->t_flags & (TH_PUSH | TF_ACKNOW)) || tp->t_force != 0 ||
tp->t_lastchain >= tcp_packet_chaining) {
error = 0;
while (!(tp->t_flags & TF_SENDINPROG) &&
tp->t_pktlist_head != NULL) {
packetlist = tp->t_pktlist_head;
packchain_listadd = tp->t_lastchain;
packchain_sent++;
lost = tp->t_pktlist_sentlen;
TCP_PKTLIST_CLEAR(tp);
tp->t_flags |= TF_SENDINPROG;
error = tcp_ip_output(so, tp, packetlist,
packchain_listadd, tp_inp_options,
(so_options & SO_DONTROUTE), (sack_rxmit | (sack_bytes_rxmt != 0)), recwin);
tp->t_flags &= ~TF_SENDINPROG;
if (error) {
/* on error, "lost" bytes must be rewound below */
lost += tp->t_pktlist_sentlen;
break;
} else {
lost = 0;
}
}
/* Perform a deferred close if one arrived while sending. */
if ((tp->t_flags & (TF_CLOSING|TF_SENDINPROG)) == TF_CLOSING) {
tp->t_flags &= ~TF_CLOSING;
(void) tcp_close(tp);
return (0);
}
}
else {
error = 0;
packchain_looped++;
tcpstat.tcps_sndtotal++;
goto again;
}
}
if (error) {
/*
 * Undo the snd_nxt / SACK-rxmit advance for data that was never
 * actually sent, then classify the error: ENOBUFS and EMSGSIZE
 * are handled locally; unreachable/net-down become soft errors.
 */
if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
if ((flags & TH_SYN) == 0) {
if (sack_rxmit) {
p->rxmit -= lost;
tp->sackhint.sack_bytes_rexmit -= lost;
} else
tp->snd_nxt -= lost;
}
}
out:
if (tp->t_pktlist_head != NULL)
m_freem_list(tp->t_pktlist_head);
TCP_PKTLIST_CLEAR(tp);
if (error == ENOBUFS) {
if (!tp->t_timer[TCPT_REXMT] &&
!tp->t_timer[TCPT_PERSIST])
tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
/* Out of mbufs: collapse cwnd so we retry gently. */
tp->snd_cwnd = tp->t_maxseg;
tp->t_bytes_acked = 0;
tcp_check_timer_state(tp);
KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb,
struct tcpcb *, tp, struct tcphdr *, NULL,
int32_t, TCP_CC_OUTPUT_ERROR);
return (0);
}
if (error == EMSGSIZE) {
/* Segment too large: disable TSO and rediscover the path MTU. */
if (tso)
tp->t_flags &= ~TF_TSO;
tcp_mtudisc(tp->t_inpcb, 0);
tcp_check_timer_state(tp);
KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
return 0;
}
if ((error == EHOSTUNREACH || error == ENETDOWN)
&& TCPS_HAVERCVDSYN(tp->t_state)) {
tp->t_softerror = error;
tcp_check_timer_state(tp);
KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
return (0);
}
tcp_check_timer_state(tp);
KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
return (error);
}
tcpstat.tcps_sndtotal++;
/*
 * For IPv6 (which bypassed tcp_ip_output) update the advertised
 * window and clear pending-ACK state here.
 */
#if INET6
if (isipv6) {
if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
tp->rcv_adv = tp->rcv_nxt + recwin;
tp->last_ack_sent = tp->rcv_nxt;
tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
tp->t_timer[TCPT_DELACK] = 0;
tp->t_unacksegs = 0;
}
#endif
KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,0,0,0,0,0);
if (sendalot)
goto again;
tcp_check_timer_state(tp);
return (0);
}
/*
 * tcp_ip_output -- push a chain of built TCP/IPv4 segments to ip_output_list().
 *
 * pkt is a list of cnt mbuf packets linked via m_nextpkt; opt carries the
 * pcb's IP options, flags the SO_DONTROUTE bit.  Before sending, commits
 * the ACK/window state carried in the segments (rcv_adv, last_ack_sent,
 * delayed-ACK flags/timer).  When tcp_output_unlocked permits -- no upcall
 * in progress, ESTABLISHED, no SACK retransmission, not in fast recovery --
 * the socket lock is dropped across the IP call; the route is copied out
 * of the pcb beforehand and copied back in afterwards so the pcb route is
 * never touched unlocked.  Returns the first error from ip_output_list().
 */
static int
tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt,
int cnt, struct mbuf *opt, int flags, int sack_in_progress, int recwin)
{
int error = 0;
boolean_t chain;
boolean_t unlocked = FALSE;
struct inpcb *inp = tp->t_inpcb;
struct ip_out_args ipoa;
struct route ro;
unsigned int outif;
/* Scope the output to the bound interface, honoring no-cellular. */
ipoa.ipoa_boundif = (inp->inp_flags & INP_BOUND_IF) ?
inp->inp_boundif : IFSCOPE_NONE;
ipoa.ipoa_nocell = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0;
flags |= IP_OUTARGS;
/* Copy the cached route so IP never uses the pcb route unlocked. */
inp_route_copyout(inp, &ro);
/* Commit window/ACK bookkeeping for the segments being sent. */
if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
tp->rcv_adv = tp->rcv_nxt + recwin;
tp->last_ack_sent = tp->rcv_nxt;
tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
tp->t_timer[TCPT_DELACK] = 0;
tp->t_unacksegs = 0;
if (tcp_output_unlocked && ((so->so_flags & SOF_UPCALLINUSE) == 0) &&
(tp->t_state == TCPS_ESTABLISHED) && (sack_in_progress == 0) &&
((tp->t_flags & TF_FASTRECOVERY) == 0)) {
unlocked = TRUE;
socket_unlock(so, 0);
}
/* Chaining into IP is only safe without IPsec/firewall interposition. */
chain = tcp_packet_chaining > 1
#if IPSEC
&& ipsec_bypass
#endif
#if IPFIREWALL
&& (fw_enable == 0 || fw_bypass)
#endif
;
while (pkt != NULL) {
struct mbuf *npkt = pkt->m_nextpkt;
if (!chain) {
/* Un-chained mode: send one packet at a time (cnt 0 = single). */
pkt->m_nextpkt = NULL;
cnt = 0;
}
error = ip_output_list(pkt, cnt, opt, &ro, flags, 0, &ipoa);
if (chain || error) {
/* In chained mode one call handled the whole list; on error in
 * unchained mode, drop the rest of the list. */
if (!chain)
m_freem_list(npkt);
break;
}
pkt = npkt;
}
if (unlocked)
socket_lock(so, 0);
/* Remember the output interface and restore the route to the pcb. */
if (ro.ro_rt != NULL &&
(outif = ro.ro_rt->rt_ifp->if_index) != inp->inp_last_outif)
inp->inp_last_outif = outif;
inp_route_copyin(inp, &ro);
return (error);
}
/*
 * tcp_setpersist -- arm the persist (zero-window probe) timer.
 *
 * The interval is derived from the smoothed RTT estimate
 * (t = srtt/4 + rttvar, halved) scaled by the retransmit backoff table
 * and clamped to [TCPTV_PERSMIN, TCPTV_PERSMAX] plus the per-connection
 * retransmit slop.  The first time the timer is armed, a persist-stop
 * deadline (t_persist_stop) is recorded so probing eventually gives up.
 * Each call bumps t_rxtshift (up to TCP_MAXRXTSHIFT) to back off further.
 *
 * NOTE: converted from an obsolete K&R identifier-list definition to an
 * ANSI prototype definition; behavior is unchanged.
 */
void
tcp_setpersist(struct tcpcb *tp)
{
int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
/* Record the give-up deadline only on the first arming. */
if ((tp->t_persist_timeout != 0) &&
(tp->t_timer[TCPT_PERSIST] == 0) &&
(tp->t_persist_stop == 0)) {
tp->t_persist_stop = tcp_now + tp->t_persist_timeout;
}
/* Start/restart persistance timer. */
TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
t * tcp_backoff[tp->t_rxtshift],
TCPTV_PERSMIN, TCPTV_PERSMAX,
TCP_ADD_REXMTSLOP(tp));
tp->t_timer[TCPT_PERSIST] = OFFSET_FROM_START(tp, tp->t_timer[TCPT_PERSIST]);
if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
tp->t_rxtshift++;
}