mptcp_usrreq.c   [plain text]


/*
 * Copyright (c) 2012-2015 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/mcache.h>
#include <sys/syslog.h>
#include <sys/proc.h>
#include <sys/proc_internal.h>
#include <sys/resourcevar.h>

#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_timer.h>
#include <netinet/mptcp_var.h>
#include <netinet/mptcp_timer.h>

#include <mach/sdt.h>

/*
 * Forward declarations for the file-local (static) routines defined
 * or referenced in this file.  The mptcp_usr_* functions are the
 * protocol user-request callbacks installed in mptcp_usrreqs below.
 */
static int mptcp_usr_attach(struct socket *, int, struct proc *);
static int mptcp_usr_detach(struct socket *);
static int mptcp_attach(struct socket *, struct proc *);
static int mptcp_detach(struct socket *, struct mppcb *);
static int mptcp_connectx(struct mptses *, struct sockaddr_list **,
    struct sockaddr_list **, struct proc *, uint32_t, sae_associd_t,
    sae_connid_t *, uint32_t, void *, uint32_t);
static int mptcp_usr_connectx(struct socket *, struct sockaddr_list **,
    struct sockaddr_list **, struct proc *, uint32_t, sae_associd_t,
    sae_connid_t *, uint32_t, void *, uint32_t, struct uio *, user_ssize_t *);
static int mptcp_getassocids(struct mptses *, uint32_t *, user_addr_t);
static int mptcp_getconnids(struct mptses *, sae_associd_t, uint32_t *,
    user_addr_t);
static int mptcp_getconninfo(struct mptses *, sae_connid_t *, uint32_t *,
    uint32_t *, int32_t *, user_addr_t, socklen_t *, user_addr_t, socklen_t *,
    uint32_t *, user_addr_t, uint32_t *);
static int mptcp_usr_control(struct socket *, u_long, caddr_t, struct ifnet *,
    struct proc *);
static int mptcp_disconnectx(struct mptses *, sae_associd_t, sae_connid_t);
static int mptcp_usr_disconnect(struct socket *);
static int mptcp_usr_disconnectx(struct socket *, sae_associd_t, sae_connid_t);
static struct mptses *mptcp_usrclosed(struct mptses *);
static int mptcp_usr_peeloff(struct socket *, sae_associd_t, struct socket **);
static int mptcp_peeloff(struct mptses *, sae_associd_t, struct socket **);
static int mptcp_usr_rcvd(struct socket *, int);
static int mptcp_usr_send(struct socket *, int, struct mbuf *,
    struct sockaddr *, struct mbuf *, struct proc *);
static int mptcp_usr_shutdown(struct socket *);
static int mptcp_uiotombuf(struct uio *, int, int, uint32_t, struct mbuf **);
static int mptcp_usr_sosend(struct socket *, struct sockaddr *, struct uio *,
    struct mbuf *, struct mbuf *, int);
static int mptcp_usr_socheckopt(struct socket *, struct sockopt *);
static int mptcp_setopt_apply(struct mptses *, struct mptopt *);
static int mptcp_setopt(struct mptses *, struct sockopt *);
static int mptcp_getopt(struct mptses *, struct sockopt *);
static int mptcp_default_tcp_optval(struct mptses *, struct sockopt *, int *);
static void mptcp_connorder_helper(struct mptsub *mpts);
static int mptcp_usr_preconnect(struct socket *so);

/*
 * User-request switch for MPTCP sockets.  Each entry binds a pr_usrreqs
 * callback slot to the handler in this file; receive is the one slot
 * that is served by the generic soreceive() instead of an MPTCP-specific
 * routine.
 */
struct pr_usrreqs mptcp_usrreqs = {
	.pru_attach =		mptcp_usr_attach,
	.pru_connectx =		mptcp_usr_connectx,
	.pru_control =		mptcp_usr_control,
	.pru_detach =		mptcp_usr_detach,
	.pru_disconnect =	mptcp_usr_disconnect,
	.pru_disconnectx =	mptcp_usr_disconnectx,
	.pru_peeloff =		mptcp_usr_peeloff,
	.pru_rcvd =		mptcp_usr_rcvd,
	.pru_send =		mptcp_usr_send,
	.pru_shutdown =		mptcp_usr_shutdown,
	.pru_sosend =		mptcp_usr_sosend,
	.pru_soreceive =	soreceive,	/* generic socket-layer receive */
	.pru_socheckopt =	mptcp_usr_socheckopt,
	.pru_preconnect =	mptcp_usr_preconnect,
};

/*
 * pru_attach callback: give a freshly created socket its MPTCP control
 * block.  The socket must not have one yet.  Returns 0 on success or the
 * error reported by mptcp_attach().
 */
static int
mptcp_usr_attach(struct socket *mp_so, int proto, struct proc *p)
{
#pragma unused(proto)
	int rc;

	VERIFY(sotomppcb(mp_so) == NULL);

	rc = mptcp_attach(mp_so, p);
	if (rc != 0)
		return (rc);

	/*
	 * XXX: adi@apple.com
	 *
	 * Might want to use a different SO_LINGER timeout than TCP's?
	 *
	 * For now, a socket that asked for SO_LINGER without specifying a
	 * linger time inherits TCP's default.
	 */
	if (mp_so->so_linger == 0 && (mp_so->so_options & SO_LINGER) != 0)
		mp_so->so_linger = TCP_LINGERTIME * hz;

	return (0);
}

/*
 * pru_detach callback: hand the socket's MPTCP control block over to
 * mptcp_detach() and report its result.  The control block and its
 * back-pointer to a socket must both be present.
 */
static int
mptcp_usr_detach(struct socket *mp_so)
{
	struct mppcb *pcb = sotomppcb(mp_so);

	VERIFY(pcb != NULL);
	VERIFY(pcb->mpp_socket != NULL);

	return (mptcp_detach(mp_so, pcb));
}

/*
 * Attach the MPTCP protocol to a socket: reserve socket buffer space
 * (when none has been reserved yet), adjust the socket buffer flags and
 * allocate the MP control block through mp_pcballoc().  Returns 0 on
 * success, otherwise the error from soreserve() or mp_pcballoc().
 */
static int
mptcp_attach(struct socket *mp_so, struct proc *p)
{
#pragma unused(p)
	struct mppcb *pcb;
	struct mptses *ses;
	int rc;

	/* Reserve send/receive space only if either side has none. */
	if (mp_so->so_snd.sb_hiwat == 0 || mp_so->so_rcv.sb_hiwat == 0) {
		rc = soreserve(mp_so, tcp_sendspace, MPTCP_RWIN_MAX);
		if (rc != 0)
			return (rc);
	}

	if (mp_so->so_snd.sb_preconn_hiwat == 0)
		soreserve_preconnect(mp_so, 2048);

	/*
	 * Every mbuf linked through m_next in an MPTCP socket buffer is an
	 * M_PKTHDR carrying MPTCP metadata, hence the buffers must never
	 * be compressed.
	 */
	mp_so->so_snd.sb_flags |= SB_NOCOMPRESS;
	mp_so->so_rcv.sb_flags |= SB_NOCOMPRESS;

	/* Socket buffer auto-tuning is turned off in both directions. */
	mp_so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
	mp_so->so_snd.sb_flags &= ~SB_AUTOSIZE;

	rc = mp_pcballoc(mp_so, &mtcbinfo);
	if (rc != 0)
		return (rc);

	/* Sanity-check what the allocation is expected to have set up. */
	pcb = sotomppcb(mp_so);
	VERIFY(pcb != NULL);
	ses = (struct mptses *)pcb->mpp_pcbe;
	VERIFY(ses != NULL);
	VERIFY(ses->mpte_mptcb != NULL);

	return (0);
}

/*
 * Invoked once the socket layer drops its last reference to the socket.
 * The only situation in which state would be kept around past this
 * point is time wait (not handled yet, see the XXX note below).
 * Always returns 0.
 */
static int
mptcp_detach(struct socket *mp_so, struct mppcb *mpp)
{
	struct mptses *ses;

	/* Socket and control block must point at each other. */
	VERIFY(mp_so->so_pcb == mpp);
	VERIFY(mpp->mpp_socket == mp_so);
	VERIFY(mpp->mpp_pcbinfo != NULL);

	__IGNORE_WCASTALIGN(ses = &((struct mpp_mtp *)mpp)->mpp_ses);
	VERIFY(ses->mpte_mppcb == mpp);

	MPTE_LOCK_ASSERT_HELD(ses);	/* same as MP socket lock */

	/*
	 * The MPTCP socket has been closed.  Start the PCB detach
	 * sequence (SOF_PCBCLEARING will be set) and then ask for every
	 * subflow to be disconnected, if that has not happened already.
	 */
	mp_pcbdetach(mpp);

	(void) mptcp_disconnectx(ses, SAE_ASSOCID_ALL, SAE_CONNID_ALL);

	/*
	 * XXX: adi@apple.com
	 *
	 * Here, we would want to handle time wait state.
	 */

	return (0);
}

/*
 * Common subroutine to open a MPTCP connection to one of the remote hosts
 * specified by dst_sl.  This includes allocating and establishing a
 * subflow TCP connection, either initially to establish MPTCP connection,
 * or to join an existing one.  Returns a connection handle upon success.
 *
 * The caller must hold the MPTCP session (MP socket) lock.  dst_sl and
 * pcid are mandatory; src_sl is optional.  Once a subflow has been
 * allocated, the address lists referenced by src_sl/dst_sl are handed
 * over to it and the caller's pointers are set to NULL.  On success the
 * new subflow's connection ID is stored in *pcid.
 *
 * Returns 0 on success, ENOBUFS if no subflow could be allocated, or
 * the error returned by mptcp_subflow_add().
 */
static int
mptcp_connectx(struct mptses *mpte, struct sockaddr_list **src_sl,
    struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
    sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
    uint32_t arglen)
{
#pragma unused(p, aid, flags, arg, arglen)
	struct mptsub *mpts;
	struct socket *mp_so;
	int error = 0;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mp_so = mpte->mpte_mppcb->mpp_socket;

	VERIFY(dst_sl != NULL && *dst_sl != NULL);
	VERIFY(pcid != NULL);

	mptcplog((LOG_DEBUG, "MPTCP Socket: "
	    "%s: mp_so 0x%llx\n", __func__,
	    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
	    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);

	DTRACE_MPTCP3(connectx, struct mptses *, mpte, sae_associd_t, aid,
	    struct socket *, mp_so);

	mpts = mptcp_subflow_alloc(M_WAITOK);
	if (mpts == NULL) {
		error = ENOBUFS;
		goto out;
	}
	MPTS_ADDREF(mpts);		/* for this routine */

	/* Transfer the address lists to the subflow; caller loses them. */
	if (src_sl != NULL) {
		mpts->mpts_src_sl = *src_sl;
		*src_sl = NULL;
	}
	mpts->mpts_dst_sl = *dst_sl;
	*dst_sl = NULL;

	error = mptcp_subflow_add(mpte, mpts, p, ifscope);
	if (error == 0 && pcid != NULL)
		*pcid = mpts->mpts_connid;

out:
	if (mpts != NULL) {
		/*
		 * EWOULDBLOCK is not treated as a failure here.  For any
		 * other error, a subflow that made it onto the session
		 * (MPTSF_ATTACHED) is removed again after our own
		 * reference has been dropped.
		 */
		if ((error != 0) && (error != EWOULDBLOCK)) {
			MPTS_LOCK(mpts);
			if (mpts->mpts_flags & MPTSF_ATTACHED) {
				MPTS_UNLOCK(mpts);
				MPTS_REMREF(mpts);
				mptcp_subflow_del(mpte, mpts, TRUE);
				return (error);
			}
			MPTS_UNLOCK(mpts);
		}
		MPTS_REMREF(mpts);	/* drop the reference taken above */
	}

	return (error);
}

/*
 * User-protocol pru_connectx callback.
 *
 * Validates the MPTCP control block, opens a subflow through
 * mptcp_connectx() and, when the caller supplied data in auio, queues
 * that data via the protocol's pru_sosend routine.
 *
 * mp_so		MPTCP socket (locked by the caller)
 * src_sl, dst_sl	source/destination address lists, see mptcp_connectx()
 * pcid			receives the connection ID of the new subflow
 * auio			optional data to send along with the connect
 * bytes_written	receives the number of bytes taken from auio;
 *			must be valid whenever auio is non-NULL
 *
 * Returns 0 on success; EINVAL if the control block is missing or dead,
 * or if the session has fallen back to TCP; EINPROGRESS if the data
 * send would block; EPROTO if the connection was peeled off while
 * sending; otherwise the error from mptcp_connectx() or pru_sosend.
 */
static int
mptcp_usr_connectx(struct socket *mp_so, struct sockaddr_list **src_sl,
    struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
    sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
    uint32_t arglen, struct uio *auio, user_ssize_t *bytes_written)
{
	struct mppcb *mpp = sotomppcb(mp_so);
	struct mptses *mpte = NULL;
	struct mptcb *mp_tp = NULL;
	user_ssize_t	datalen;

	int error = 0;

	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
		error = EINVAL;
		goto out;
	}
	mpte = mptompte(mpp);
	VERIFY(mpte != NULL);

	mp_tp = mpte->mpte_mptcb;
	VERIFY(mp_tp != NULL);

	if (mp_tp->mpt_flags &  MPTCPF_FALLBACK_TO_TCP) {
		error = EINVAL;
		goto out;
	}

	error = mptcp_connectx(mpte, src_sl, dst_sl, p, ifscope,
	    aid, pcid, flags, arg, arglen);

	/*
	 * A hard connect failure must reach the caller.  Without this
	 * check the result of the data send below would overwrite it and
	 * the caller could be told that a failed connect succeeded.
	 * "In progress" style results are let through, matching how
	 * mptcp_connectx() itself treats EWOULDBLOCK.
	 */
	if (error != 0 && error != EWOULDBLOCK && error != EINPROGRESS)
		goto out;

	/* If there is data, copy it */
	if (auio != NULL) {
		datalen = uio_resid(auio);
		/* pru_sosend is entered with the socket unlocked */
		socket_unlock(mp_so, 0);
		error = mp_so->so_proto->pr_usrreqs->pru_sosend(mp_so, NULL,
		    (uio_t) auio, NULL, NULL, 0);
		/* check if this can be supported with fast Join also. XXX */
		if (error == 0 || error == EWOULDBLOCK)
			*bytes_written = datalen - uio_resid(auio);

		if (error == EWOULDBLOCK)
			error = EINPROGRESS;

		socket_lock(mp_so, 0);
		MPT_LOCK(mp_tp);
		if (mp_tp->mpt_flags & MPTCPF_PEEL_OFF) {
			*bytes_written = datalen - uio_resid(auio);
			/*
			 * Override errors like EPIPE that occur as
			 * a result of doing TFO during TCP fallback.
			 */
			error = EPROTO;
		}
		MPT_UNLOCK(mp_tp);
	}

out:
	return (error);
}

/*
 * SIOCGASSOCIDS handler for the PF_MULTIPATH domain.
 *
 * Stores the number of associations in *cnt (an MPTCP session has at
 * most one) and, when aidp is a non-null user address, copies the
 * session's association ID out to it.  Returns 0 or the copyout()
 * error.
 */
static int
mptcp_getassocids(struct mptses *mpte, uint32_t *cnt, user_addr_t aidp)
{
	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */

	if (mpte->mpte_associd != SAE_ASSOCID_ANY)
		*cnt = 1;
	else
		*cnt = 0;

	/* With a destination buffer, hand back the ID itself as well. */
	if (aidp != USER_ADDR_NULL) {
		return (copyout(&mpte->mpte_associd, aidp,
		    sizeof (mpte->mpte_associd)));
	}

	/* The caller only wanted the count. */
	return (0);
}

/*
 * SIOCGCONNIDS handler for the PF_MULTIPATH domain.
 *
 * aid must be a wildcard (ANY/ALL) or the session's own association ID,
 * otherwise EINVAL is returned.  *cnt is set to the session's subflow
 * count; its incoming value is not consulted.  When cidp is a non-null
 * user address, the connection ID of every subflow is copied out to
 * consecutive slots starting at cidp.  Returns 0 or the first copyout()
 * error.
 */
static int
mptcp_getconnids(struct mptses *mpte, sae_associd_t aid, uint32_t *cnt,
    user_addr_t cidp)
{
	struct mptsub *sub;
	int rc;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */

	if (aid != SAE_ASSOCID_ANY && aid != SAE_ASSOCID_ALL) {
		if (aid != mpte->mpte_associd)
			return (EINVAL);
	}

	*cnt = mpte->mpte_numflows;

	/* No buffer: the caller is only after the count. */
	if (cidp == USER_ADDR_NULL)
		return (0);

	TAILQ_FOREACH(sub, &mpte->mpte_subflows, mpts_entry) {
		rc = copyout(&sub->mpts_connid, cidp,
		    sizeof (sub->mpts_connid));
		if (rc != 0)
			return (rc);

		cidp += sizeof (sub->mpts_connid);
	}

	return (0);
}

/*
 * Handle SIOCGCONNINFO ioctl for PF_MULTIPATH domain.
 *
 * Looks up the subflow identified by *cid (SAE_CONNID_ANY selects the
 * first subflow) and reports its state.
 *
 * cid		in: connection to query; out: ID of the subflow found
 * flags	out: CIF_* flags derived from the subflow's MPTSF_* flags
 * ifindex	out: index of the subflow's outbound interface, or 0
 * soerror	out: the subflow's recorded socket error
 * src/src_len	user buffer for the source address.  On entry *src_len
 *		is the size of that buffer; on return it is the full
 *		length of the address.  No more than the buffer size is
 *		ever copied out.
 * dst/dst_len	same as src/src_len, for the destination address
 * aux_type	out: CIAUX_TCP if the subflow has a socket, else 0
 * aux_data/aux_len
 *		user buffer for the struct conninfo_tcp.  On entry
 *		*aux_len is the buffer size; on return it is
 *		sizeof (struct conninfo_tcp), or 0 without a socket.
 *		No more than the buffer size is ever copied out.
 *
 * Returns 0 on success; EINVAL for SAE_CONNID_ALL or an unknown
 * connection ID; ENXIO if SAE_CONNID_ANY was given but there is no
 * subflow; otherwise the copyout() error.
 */
static int
mptcp_getconninfo(struct mptses *mpte, sae_connid_t *cid, uint32_t *flags,
    uint32_t *ifindex, int32_t *soerror, user_addr_t src, socklen_t *src_len,
    user_addr_t dst, socklen_t *dst_len, uint32_t *aux_type,
    user_addr_t aux_data, uint32_t *aux_len)
{
	struct sockaddr_entry *se;
	struct ifnet *ifp = NULL;
	struct mptsub *mpts;
	socklen_t user_len, copy_len;
	uint32_t user_aux_len, aux_copy_len;
	int error = 0;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */

	if (*cid == SAE_CONNID_ALL)
		return (EINVAL);

	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		if (mpts->mpts_connid == *cid || *cid == SAE_CONNID_ANY)
			break;
	}
	if (mpts == NULL)
		return ((*cid == SAE_CONNID_ANY) ? ENXIO : EINVAL);

	MPTS_LOCK(mpts);
	ifp = mpts->mpts_outif;
	*cid = mpts->mpts_connid;
	*ifindex = ((ifp != NULL) ? ifp->if_index : 0);
	*soerror = mpts->mpts_soerror;

	/* Translate the subflow's internal flags into the CIF_* set. */
	*flags = 0;
	if (mpts->mpts_flags & MPTSF_CONNECTING)
		*flags |= CIF_CONNECTING;
	if (mpts->mpts_flags & MPTSF_CONNECTED)
		*flags |= CIF_CONNECTED;
	if (mpts->mpts_flags & MPTSF_DISCONNECTING)
		*flags |= CIF_DISCONNECTING;
	if (mpts->mpts_flags & MPTSF_DISCONNECTED)
		*flags |= CIF_DISCONNECTED;
	if (mpts->mpts_flags & MPTSF_BOUND_IF)
		*flags |= CIF_BOUND_IF;
	if (mpts->mpts_flags & MPTSF_BOUND_IP)
		*flags |= CIF_BOUND_IP;
	if (mpts->mpts_flags & MPTSF_BOUND_PORT)
		*flags |= CIF_BOUND_PORT;
	if (mpts->mpts_flags & MPTSF_PREFERRED)
		*flags |= CIF_PREFERRED;
	if (mpts->mpts_flags & MPTSF_MP_CAPABLE)
		*flags |= CIF_MP_CAPABLE;
	if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
		*flags |= CIF_MP_DEGRADED;
	if (mpts->mpts_flags & MPTSF_MP_READY)
		*flags |= CIF_MP_READY;
	if (mpts->mpts_flags & MPTSF_ACTIVE)
		*flags |= CIF_MP_ACTIVE;

	/*
	 * Source address: report the full length, but never write past
	 * the buffer size the caller declared in *src_len.
	 */
	VERIFY(mpts->mpts_src_sl != NULL);
	se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head);
	VERIFY(se != NULL && se->se_addr != NULL);
	user_len = *src_len;
	*src_len = se->se_addr->sa_len;
	if (src != USER_ADDR_NULL) {
		copy_len = (user_len < *src_len) ? user_len : *src_len;
		error = copyout(se->se_addr, src, copy_len);
		if (error != 0)
			goto out;
	}

	/* Destination address: same treatment as the source address. */
	VERIFY(mpts->mpts_dst_sl != NULL);
	se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head);
	VERIFY(se != NULL && se->se_addr != NULL);
	user_len = *dst_len;
	*dst_len = se->se_addr->sa_len;
	if (dst != USER_ADDR_NULL) {
		copy_len = (user_len < *dst_len) ? user_len : *dst_len;
		error = copyout(se->se_addr, dst, copy_len);
		if (error != 0)
			goto out;
	}

	/* Remember the caller's buffer size before it gets overwritten. */
	user_aux_len = *aux_len;
	*aux_type = 0;
	*aux_len = 0;
	if (mpts->mpts_socket != NULL) {
		struct conninfo_tcp tcp_ci;

		*aux_type = CIAUX_TCP;
		*aux_len = sizeof (tcp_ci);

		aux_copy_len = (user_aux_len < sizeof (tcp_ci)) ?
		    user_aux_len : (uint32_t)sizeof (tcp_ci);

		/* Gather the TCP details only if some of it can be copied. */
		if (aux_data != USER_ADDR_NULL && aux_copy_len != 0) {
			struct socket *so = mpts->mpts_socket;

			VERIFY(SOCK_PROTO(so) == IPPROTO_TCP);
			bzero(&tcp_ci, sizeof (tcp_ci));
			socket_lock(so, 0);
			tcp_getconninfo(so, &tcp_ci);
			socket_unlock(so, 0);
			error = copyout(&tcp_ci, aux_data, aux_copy_len);
			if (error != 0)
				goto out;
		}
	}
	mptcplog((LOG_DEBUG, "MPTCP Socket: "
	    "%s: cid %d flags %x \n",
	    __func__, mpts->mpts_connid, mpts->mpts_flags),
	    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

out:
	MPTS_UNLOCK(mpts);
	return (error);
}

/*
 * Handle SIOCSCONNORDER
 *
 * Assigns a rank to the subflow identified by cid.  A rank of 1 makes
 * the subflow the preferred one (provided it is MP capable); a rank
 * greater than 1 stores that rank and clears the preferred flag; a rank
 * of 0 lets this routine decide, based on whether another subflow is
 * currently preferred.
 *
 * The caller must hold the MPTCP session (MP socket) lock.
 *
 * Returns 0 on success, EINVAL if cid is SAE_CONNID_ANY/SAE_CONNID_ALL,
 * or ENXIO if no subflow has that connection ID.
 */
int
mptcp_setconnorder(struct mptses *mpte, sae_connid_t cid, uint32_t rank)
{
	struct mptsub *mpts, *mpts1;
	int error = 0;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mptcplog((LOG_DEBUG, "MPTCP Socket: "
	    "%s: cid %d rank %d \n", __func__, cid, rank),
	    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

	if (cid == SAE_CONNID_ANY || cid == SAE_CONNID_ALL) {
		error = EINVAL;
		goto out;
	}

	/* Locate the subflow to re-rank; mpts is NULL if none matches. */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		if (mpts->mpts_connid == cid)
			break;
	}
	if (mpts == NULL) {
		error = ENXIO;
		goto out;
	}

	if (rank == 0 || rank > 1) {
		/*
		 * If rank is 0, determine whether this should be the
		 * primary or backup subflow, depending on what we have.
		 *
		 * Otherwise, if greater than 0, make it a backup flow.
		 */
		/* Find the currently preferred subflow; NULL if none. */
		TAILQ_FOREACH(mpts1, &mpte->mpte_subflows, mpts_entry) {
			MPTS_LOCK(mpts1);
			if (mpts1->mpts_flags & MPTSF_PREFERRED) {
				MPTS_UNLOCK(mpts1);
				break;
			}
			MPTS_UNLOCK(mpts1);
		}

		MPTS_LOCK(mpts);
		mpts->mpts_flags &= ~MPTSF_PREFERRED;
		mpts->mpts_rank = rank;
		if (mpts1 != NULL && mpts != mpts1) {
			/* preferred subflow found; set rank as necessary */
			/*
			 * NOTE(review): mpts1->mpts_rank is read here with
			 * only mpts locked, not mpts1 -- confirm that the
			 * session lock is sufficient protection.
			 */
			if (rank == 0)
				mpts->mpts_rank = (mpts1->mpts_rank + 1);
		} else if (rank == 0) {
			/* no preferred one found; promote this */
			/* falls through to the rank == 1 handling below */
			rank = 1;
		}
		MPTS_UNLOCK(mpts);
	}

	if (rank == 1) {
		/*
		 * If rank is 1, promote this subflow to be preferred.
		 */
		/*
		 * Any other subflow that is preferred loses the flag; the
		 * target gets rank 1 and, if MP capable, the preferred
		 * flag.  mptcp_connorder_helper() is only invoked when
		 * more than one MP capable subflow exists.
		 */
		TAILQ_FOREACH(mpts1, &mpte->mpte_subflows, mpts_entry) {
			MPTS_LOCK(mpts1);
			if (mpts1 != mpts &&
			    (mpts1->mpts_flags & MPTSF_PREFERRED)) {
				mpts1->mpts_flags &= ~MPTSF_PREFERRED;
				if (mpte->mpte_nummpcapflows > 1)
					mptcp_connorder_helper(mpts1);
			} else if (mpts1 == mpts) {
				mpts1->mpts_rank = 1;
				if (mpts1->mpts_flags & MPTSF_MP_CAPABLE) {
					mpts1->mpts_flags |= MPTSF_PREFERRED;
					if (mpte->mpte_nummpcapflows > 1)
						mptcp_connorder_helper(mpts1);
				}
			}
			MPTS_UNLOCK(mpts1);
		}
	}

out:
	return (error);
}

/*
 * Propagate a subflow's preferred/non-preferred status to its TCP
 * control block.  With the subflow socket locked, TMPF_SND_MPPRIO is
 * raised and TMPF_BACKUP_PATH is cleared for a preferred subflow or set
 * for any other.  The caller holds the subflow lock (see
 * mptcp_setconnorder()).
 */
static void
mptcp_connorder_helper(struct mptsub *mpts)
{
	struct socket *subso = mpts->mpts_socket;
	struct tcpcb *tp;

	socket_lock(subso, 0);

	tp = intotcpcb(sotoinpcb(subso));
	tp->t_mpflags |= TMPF_SND_MPPRIO;
	if ((mpts->mpts_flags & MPTSF_PREFERRED) != 0)
		tp->t_mpflags &= ~TMPF_BACKUP_PATH;	/* primary path */
	else
		tp->t_mpflags |= TMPF_BACKUP_PATH;	/* backup path */

	socket_unlock(subso, 0);
}

/*
 * Handle SIOCGCONNORDER
 *
 * Stores the rank of the subflow identified by cid in *rank; *rank is
 * 0 whenever an error is returned.  Returns EINVAL if cid is
 * SAE_CONNID_ANY/SAE_CONNID_ALL and ENXIO if no subflow has that
 * connection ID.
 */
int
mptcp_getconnorder(struct mptses *mpte, sae_connid_t cid, uint32_t *rank)
{
	struct mptsub *sub;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	VERIFY(rank != NULL);
	*rank = 0;

	/* Wildcards do not name a single subflow. */
	if (cid == SAE_CONNID_ANY || cid == SAE_CONNID_ALL)
		return (EINVAL);

	TAILQ_FOREACH(sub, &mpte->mpte_subflows, mpts_entry) {
		if (sub->mpts_connid != cid)
			continue;

		MPTS_LOCK(sub);
		*rank = sub->mpts_rank;
		MPTS_UNLOCK(sub);
		return (0);
	}

	/* Ran off the end of the list without a match. */
	return (ENXIO);
}

/*
 * pru_control callback: ioctl dispatcher for MPTCP sockets.
 *
 * Each supported command copies the request structure out of data into
 * a local, runs the matching handler and, only when the handler
 * succeeded, copies the (possibly updated) structure back into data.
 *
 * Returns EINVAL when the control block is missing or dead, EOPNOTSUPP
 * for an unrecognized command, otherwise the handler's result.
 */
static int
mptcp_usr_control(struct socket *mp_so, u_long cmd, caddr_t data,
    struct ifnet *ifp, struct proc *p)
{
#pragma unused(ifp, p)
	struct mppcb *pcb = sotomppcb(mp_so);
	struct mptses *ses;
	int rc;

	if (pcb == NULL || pcb->mpp_state == MPPCB_STATE_DEAD)
		return (EINVAL);

	ses = mptompte(pcb);
	VERIFY(ses != NULL);

	MPTE_LOCK_ASSERT_HELD(ses);	/* same as MP socket lock */

	switch (cmd) {
	case SIOCGASSOCIDS32: {		/* struct so_aidreq32 */
		struct so_aidreq32 req;

		bcopy(data, &req, sizeof (req));
		rc = mptcp_getassocids(ses, &req.sar_cnt, req.sar_aidp);
		if (rc == 0)
			bcopy(&req, data, sizeof (req));
		break;
	}

	case SIOCGASSOCIDS64: {		/* struct so_aidreq64 */
		struct so_aidreq64 req;

		bcopy(data, &req, sizeof (req));
		rc = mptcp_getassocids(ses, &req.sar_cnt, req.sar_aidp);
		if (rc == 0)
			bcopy(&req, data, sizeof (req));
		break;
	}

	case SIOCGCONNIDS32: {		/* struct so_cidreq32 */
		struct so_cidreq32 req;

		bcopy(data, &req, sizeof (req));
		rc = mptcp_getconnids(ses, req.scr_aid, &req.scr_cnt,
		    req.scr_cidp);
		if (rc == 0)
			bcopy(&req, data, sizeof (req));
		break;
	}

	case SIOCGCONNIDS64: {		/* struct so_cidreq64 */
		struct so_cidreq64 req;

		bcopy(data, &req, sizeof (req));
		rc = mptcp_getconnids(ses, req.scr_aid, &req.scr_cnt,
		    req.scr_cidp);
		if (rc == 0)
			bcopy(&req, data, sizeof (req));
		break;
	}

	case SIOCGCONNINFO32: {		/* struct so_cinforeq32 */
		struct so_cinforeq32 req;

		bcopy(data, &req, sizeof (req));
		rc = mptcp_getconninfo(ses, &req.scir_cid,
		    &req.scir_flags, &req.scir_ifindex, &req.scir_error,
		    req.scir_src, &req.scir_src_len, req.scir_dst,
		    &req.scir_dst_len, &req.scir_aux_type, req.scir_aux_data,
		    &req.scir_aux_len);
		if (rc == 0)
			bcopy(&req, data, sizeof (req));
		break;
	}

	case SIOCGCONNINFO64: {		/* struct so_cinforeq64 */
		struct so_cinforeq64 req;

		bcopy(data, &req, sizeof (req));
		rc = mptcp_getconninfo(ses, &req.scir_cid,
		    &req.scir_flags, &req.scir_ifindex, &req.scir_error,
		    req.scir_src, &req.scir_src_len, req.scir_dst,
		    &req.scir_dst_len, &req.scir_aux_type, req.scir_aux_data,
		    &req.scir_aux_len);
		if (rc == 0)
			bcopy(&req, data, sizeof (req));
		break;
	}

	case SIOCSCONNORDER: {		/* struct so_cordreq */
		struct so_cordreq req;

		bcopy(data, &req, sizeof (req));
		rc = mptcp_setconnorder(ses, req.sco_cid, req.sco_rank);
		if (rc == 0)
			bcopy(&req, data, sizeof (req));
		break;
	}

	case SIOCGCONNORDER: {		/* struct so_cordreq */
		struct so_cordreq req;

		bcopy(data, &req, sizeof (req));
		rc = mptcp_getconnorder(ses, req.sco_cid, &req.sco_rank);
		if (rc == 0)
			bcopy(&req, data, sizeof (req));
		break;
	}

	default:
		rc = EOPNOTSUPP;
		break;
	}

	return (rc);
}

/*
 * Initiate a disconnect.  MPTCP-level disconnection is specified by
 * CONNID_{ANY,ALL}.  Otherwise, selectively disconnect a subflow
 * connection while keeping the MPTCP-level connection (association).
 *
 * The caller must hold the MPTCP session (MP socket) lock and must have
 * validated aid (it is only asserted here).
 *
 * Returns 0 on success; ENOTCONN or EALREADY when an association-level
 * disconnect is requested on a socket that is not connected/connecting
 * or is already disconnecting (these checks are skipped once
 * SOF_PCBCLEARING is set); EINVAL when cid names no subflow.
 */
static int
mptcp_disconnectx(struct mptses *mpte, sae_associd_t aid, sae_connid_t cid)
{
	struct mptsub *mpts;
	struct socket *mp_so;
	struct mptcb *mp_tp;
	int error = 0;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */

	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;

	mptcplog((LOG_DEBUG, "MPTCP Socket: "
	    "%s: mp_so 0x%llx aid %d cid %d %d\n", __func__,
	    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), aid, cid, mp_so->so_error),
	    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);

	DTRACE_MPTCP5(disconnectx, struct mptses *, mpte, sae_associd_t, aid,
	    sae_connid_t, cid, struct socket *, mp_so, struct mptcb *, mp_tp);

	VERIFY(aid == SAE_ASSOCID_ANY || aid == SAE_ASSOCID_ALL ||
	    aid == mpte->mpte_associd);

	/* terminate the association? */
	if (cid == SAE_CONNID_ANY || cid == SAE_CONNID_ALL) {
		/* if we're not detached, go thru socket state checks */
		if (!(mp_so->so_flags & SOF_PCBCLEARING)) {
			if (!(mp_so->so_state & (SS_ISCONNECTED|
			    SS_ISCONNECTING))) {
				error = ENOTCONN;
				goto out;
			}
			if (mp_so->so_state & SS_ISDISCONNECTING) {
				error = EALREADY;
				goto out;
			}
		}
		MPT_LOCK(mp_tp);
		mptcp_cancel_all_timers(mp_tp);
		if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
			/* never got established: close right away */
			(void) mptcp_close(mpte, mp_tp);
			MPT_UNLOCK(mp_tp);
		} else if ((mp_so->so_options & SO_LINGER) &&
		    mp_so->so_linger == 0) {
			/* SO_LINGER with zero linger time: drop */
			(void) mptcp_drop(mpte, mp_tp, 0);
			MPT_UNLOCK(mp_tp);
		} else {
			/*
			 * Regular close: mark the socket disconnecting,
			 * discard pending receive data and walk through
			 * the shutdown states.
			 */
			MPT_UNLOCK(mp_tp);
			soisdisconnecting(mp_so);
			sbflush(&mp_so->so_rcv);
			if (mptcp_usrclosed(mpte) != NULL)
				(void) mptcp_output(mpte);
		}
	} else {
		bool disconnect_embryonic_subflows = false;
		struct socket *so = NULL;

		/* Disconnect only the subflow whose connection ID is cid. */
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			if (mpts->mpts_connid != cid)
				continue;

			MPTS_LOCK(mpts);
			/*
			 * Check if disconnected subflow is the one used
			 * to initiate MPTCP connection.
			 * If it is and the connection is not yet join ready
			 * disconnect all other subflows.
			 */
			so = mpts->mpts_socket;
			if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY) && 
			    so && !(so->so_flags & SOF_MP_SEC_SUBFLOW)) {
				disconnect_embryonic_subflows = true;
			}

			mpts->mpts_flags |= MPTSF_USER_DISCONNECT;
			mptcp_subflow_disconnect(mpte, mpts, FALSE);
			MPTS_UNLOCK(mpts);
			break;
		}

		/* The loop ran to completion: no subflow matched cid. */
		if (mpts == NULL) {
			error = EINVAL;
			goto out;
		}
		
		if (disconnect_embryonic_subflows) {
			/* Take down every subflow other than the one above. */
			TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
				if (mpts->mpts_connid == cid)
					continue;
				MPTS_LOCK(mpts);
				mptcp_subflow_disconnect(mpte, mpts, TRUE);
				MPTS_UNLOCK(mpts);
			}
		}
	}

	if (error == 0)
		mptcp_thread_signal(mpte);

	if ((mp_so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
	    (SS_CANTRCVMORE | SS_CANTSENDMORE)) {
		/* the socket has been shutdown, no more sockopt's */
		mptcp_flush_sopts(mpte);
	}

out:
	return (error);
}

/*
 * pru_disconnect callback: a plain disconnect is expressed as a
 * disconnectx covering every association and every connection.
 */
static int
mptcp_usr_disconnect(struct socket *mp_so)
{
	return (mptcp_usr_disconnectx(mp_so, SAE_ASSOCID_ALL,
	    SAE_CONNID_ALL));
}

/*
 * pru_disconnectx callback.
 *
 * Rejects the request with EINVAL when the control block is missing or
 * dead, or when aid is neither a wildcard nor the session's own
 * association ID; otherwise defers to mptcp_disconnectx().
 */
static int
mptcp_usr_disconnectx(struct socket *mp_so, sae_associd_t aid, sae_connid_t cid)
{
	struct mppcb *pcb = sotomppcb(mp_so);
	struct mptses *ses;

	if (pcb == NULL || pcb->mpp_state == MPPCB_STATE_DEAD)
		return (EINVAL);

	ses = mptompte(pcb);
	VERIFY(ses != NULL);
	MPTE_LOCK_ASSERT_HELD(ses);	/* same as MP socket lock */

	/* Only wildcards or our own association ID are acceptable. */
	if (aid != SAE_ASSOCID_ANY && aid != SAE_ASSOCID_ALL) {
		if (aid != ses->mpte_associd)
			return (EINVAL);
	}

	return (mptcp_disconnectx(ses, aid, cid));
}

/*
 * The user has issued a close and wants to trail through the shutdown
 * states.  Feeds MPCE_CLOSE into the MPTCP close state machine and then
 * acts on the resulting state.  Returns the session pointer, which is
 * whatever mptcp_close() returned if the connection ended up CLOSED.
 */
static struct mptses *
mptcp_usrclosed(struct mptses *mpte)
{
	struct socket *mp_so;
	struct mptcb *mp_tp;
	struct mptsub *sub;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;

	MPT_LOCK(mp_tp);
	mptcp_close_fsm(mp_tp, MPCE_CLOSE);

	/* Fully closed: tear the connection down. */
	if (mp_tp->mpt_state == MPTCPS_CLOSED) {
		mpte = mptcp_close(mpte, mp_tp);
		MPT_UNLOCK(mp_tp);
		return (mpte);
	}

	/*
	 * FIN_WAIT_2 or beyond: the socket is disconnected and every
	 * subflow is tagged as disconnected by the user.
	 */
	if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_2) {
		MPT_UNLOCK(mp_tp);
		soisdisconnected(mp_so);
		TAILQ_FOREACH(sub, &mpte->mpte_subflows, mpts_entry) {
			MPTS_LOCK(sub);
			sub->mpts_flags |= MPTSF_USER_DISCONNECT;
			MPTS_UNLOCK(sub);
		}
		return (mpte);
	}

	/* Any earlier state: tag each subflow and start disconnecting it. */
	MPT_UNLOCK(mp_tp);
	TAILQ_FOREACH(sub, &mpte->mpte_subflows, mpts_entry) {
		MPTS_LOCK(sub);
		sub->mpts_flags |= MPTSF_USER_DISCONNECT;
		mptcp_subflow_disconnect(mpte, sub, FALSE);
		MPTS_UNLOCK(sub);
	}

	return (mpte);
}

/*
 * pru_peeloff callback: checks that the socket still has a live MPTCP
 * control block (EINVAL otherwise) and lets mptcp_peeloff() do the
 * work.  psop must not be NULL.
 */
static int
mptcp_usr_peeloff(struct socket *mp_so, sae_associd_t aid, struct socket **psop)
{
	struct mppcb *pcb = sotomppcb(mp_so);
	struct mptses *ses;

	VERIFY(psop != NULL);

	if (pcb == NULL || pcb->mpp_state == MPPCB_STATE_DEAD)
		return (EINVAL);

	ses = mptompte(pcb);
	VERIFY(ses != NULL);

	return (mptcp_peeloff(ses, aid, psop));
}

/*
 * Transform a previously connected TCP subflow connection which has
 * failed to negotiate MPTCP to its own socket which can be externalized
 * with a file descriptor.  Valid only when the MPTCP socket is not
 * yet associated (MPTCP-level connection has not been established.)
 *
 * The caller must hold the MPTCP session (MP socket) lock.  aid must be
 * SAE_ASSOCID_ANY or SAE_ASSOCID_ALL.  On success *psop receives the
 * socket of the first subflow, which is removed from the session; on
 * failure *psop is NULL.
 *
 * Returns 0 on success, or EINVAL if an association already exists, aid
 * is not a wildcard, or the session has no subflow.
 */
static int
mptcp_peeloff(struct mptses *mpte, sae_associd_t aid, struct socket **psop)
{
	struct socket *so = NULL, *mp_so;
	struct mptsub *mpts;
	int error = 0;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mp_so = mpte->mpte_mppcb->mpp_socket;

	VERIFY(psop != NULL);
	*psop = NULL;

	DTRACE_MPTCP3(peeloff, struct mptses *, mpte, sae_associd_t, aid,
	    struct socket *, mp_so);

	/* peeloff cannot happen after an association is established */
	if (mpte->mpte_associd != SAE_ASSOCID_ANY) {
		error = EINVAL;
		goto out;
	}

	if (aid != SAE_ASSOCID_ANY && aid != SAE_ASSOCID_ALL) {
		error = EINVAL;
		goto out;
	}

	/* Only the first subflow is considered; the loop exits after it. */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		MPTS_LOCK(mpts);
		if (mpts->mpts_flags & MPTSF_MP_CAPABLE) {
			/*
			 * Report the subflow's own socket; the local "so"
			 * has not been assigned yet at this point.
			 */
			panic("%s: so %p is MPTCP capable but mp_so %p "
			    "aid is %d\n", __func__, mpts->mpts_socket,
			    mp_so, mpte->mpte_associd);
			/* NOTREACHED */
		}
		MPTS_ADDREF_LOCKED(mpts);	/* for us */
		so = mpts->mpts_socket;
		VERIFY(so != NULL);
		/*
		 * This subflow socket is about to be externalized; make it
		 * appear as if it has the same properties as the MPTCP socket,
		 * undo what's done earlier in mptcp_subflow_add().
		 */
		mptcp_subflow_sopeeloff(mpte, mpts, so);
		MPTS_UNLOCK(mpts);

		mptcp_subflow_del(mpte, mpts, FALSE);
		MPTS_REMREF(mpts);		/* ours */
		/*
		 * XXX adi@apple.com
		 *
		 * Here we need to make sure the subflow socket is not
		 * flow controlled; need to clear both INP_FLOW_CONTROLLED
		 * and INP_FLOW_SUSPENDED on the subflow socket, since
		 * we will no longer be monitoring its events.
		 */
		break;
	}

	/* No subflow at all: nothing to peel off. */
	if (so == NULL) {
		error = EINVAL;
		goto out;
	}
	*psop = so;

	mptcplog((LOG_DEBUG, "MPTCP Socket: "
	    "%s: mp_so 0x%llx\n", __func__,
	    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
	    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);

out:
	return (error);
}

/*
 * pru_rcvd callback: after a receive, possibly send an update to the
 * peer by running mptcp_output().  Returns EINVAL when the control
 * block is missing or dead, otherwise the result of mptcp_output().
 */
static int
mptcp_usr_rcvd(struct socket *mp_so, int flags)
{
#pragma unused(flags)
	struct mppcb *pcb = sotomppcb(mp_so);
	struct mptses *ses;

	if (pcb == NULL || pcb->mpp_state == MPPCB_STATE_DEAD)
		return (EINVAL);

	ses = mptompte(pcb);
	VERIFY(ses != NULL);

	return (mptcp_output(ses));
}

/*
 * Do a send by putting data in the output queue.
 *
 * mp_so	MPTCP socket
 * prus_flags	PRUS_* flags; PRUS_OOB and PRUS_EOF are not supported
 * m		data to send; appended to the send buffer on success,
 *		freed here on failure
 * nam		must be NULL (no destination address is supported)
 * control	must be NULL or empty; always freed here when non-NULL
 *
 * Returns 0 on success; EOPNOTSUPP for unsupported flags, a non-NULL
 * nam or non-empty control data; ECONNRESET when the control block is
 * missing or dead; ENOTCONN when the socket is neither connected nor
 * set up for preconnect data; EWOULDBLOCK when the socket is still
 * connecting and non-blocking; otherwise the error from mptcp_output()
 * or sbwait().
 */
static int
mptcp_usr_send(struct socket *mp_so, int prus_flags, struct mbuf *m,
    struct sockaddr *nam, struct mbuf *control, struct proc *p)
{
#pragma unused(nam, p)
	struct mppcb *mpp = sotomppcb(mp_so);
	struct mptses *mpte;
	int error = 0;

	if (prus_flags & (PRUS_OOB|PRUS_EOF)) {
		error = EOPNOTSUPP;
		goto out;
	}

	if (nam != NULL) {
		error = EOPNOTSUPP;
		goto out;
	}

	if (control != NULL && control->m_len != 0) {
		error = EOPNOTSUPP;
		goto out;
	}

	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
		error = ECONNRESET;
		goto out;
	}
	mpte = mptompte(mpp);
	VERIFY(mpte != NULL);

	if (!(mp_so->so_state & SS_ISCONNECTED) &&
	     (!(mp_so->so_flags1 & SOF1_PRECONNECT_DATA))) {
		error = ENOTCONN;
		goto out;
	}

	mptcp_insert_dsn(mpp, m);
	VERIFY(mp_so->so_snd.sb_flags & SB_NOCOMPRESS);
	(void) sbappendstream(&mp_so->so_snd, m);
	/* the send buffer owns the data now; do not free it below */
	m = NULL;

	/*
	 * XXX: adi@apple.com
	 *
	 * PRUS_MORETOCOME could be set, but we don't check it now.
	 */
	error = mptcp_output(mpte);
	if (error != 0)
		goto out;

	if (mp_so->so_state & SS_ISCONNECTING) {
		if (mp_so->so_state & SS_NBIO)
			error = EWOULDBLOCK;
		else
			error = sbwait(&mp_so->so_snd);
	}

out:
	if (error) {
		if (m != NULL)
			m_freem(m);
	}
	/*
	 * The control mbuf is never handed on to anyone else, so it has
	 * to be released on every path.  Freeing it only on error would
	 * leak an empty control mbuf whenever the send succeeds.
	 */
	if (control != NULL)
		m_freem(control);
	return (error);
}

/*
 * pru_shutdown callback: mark the MPTCP connection as unable to send
 * any further data, walk it through the close states and, if the
 * session is still around afterwards, run mptcp_output().  Returns
 * EINVAL when the control block is missing or dead, otherwise 0 or the
 * result of mptcp_output().
 */
static int
mptcp_usr_shutdown(struct socket *mp_so)
{
	struct mppcb *pcb = sotomppcb(mp_so);
	struct mptses *ses;

	if (pcb == NULL || pcb->mpp_state == MPPCB_STATE_DEAD)
		return (EINVAL);

	ses = mptompte(pcb);
	VERIFY(ses != NULL);

	socantsendmore(mp_so);

	/* mptcp_usrclosed() hands back NULL when the session is gone. */
	ses = mptcp_usrclosed(ses);
	if (ses == NULL)
		return (0);

	return (mptcp_output(ses));
}

/*
 * Copy the contents of uio into a properly sized mbuf chain.
 *
 * uio   - source of the data
 * how   - wait flag handed to the mbuf allocators
 * space - when positive, the maximum number of bytes to copy; when zero
 *	   or negative, everything left in uio is copied
 * align - offset applied to the data pointer of the first mbuf; must be
 *	   smaller than MHLEN
 * top   - out parameter receiving the head of the chain; must point to
 *	   NULL on entry
 *
 * Returns 0 on success, EINVAL for a negative residual count or an
 * oversized align, ENOBUFS if any mbuf cannot be allocated, or the error
 * from uiomove().  On failure the partial chain is freed and *top is
 * left untouched.
 *
 * All length arithmetic is done in user_ssize_t.  imin() operates on
 * int and would silently truncate a 64-bit residual count, which could
 * yield a negative total, an empty chain and a NULL dereference below.
 */
static int
mptcp_uiotombuf(struct uio *uio, int how, int space, uint32_t align,
    struct mbuf **top)
{
	struct mbuf *m, *mb, *nm = NULL, *mtail = NULL;
	user_ssize_t resid, tot, len, progress;	/* must be user_ssize_t */
	int error;

	VERIFY(top != NULL && *top == NULL);

	/* a negative residual would leave us without any mbuf to fill */
	resid = uio_resid(uio);
	if (resid < 0)
		return (EINVAL);

	/*
	 * space can be zero or an arbitrary large value bound by
	 * the total data supplied by the uio.
	 */
	if (space > 0 && resid > (user_ssize_t)space)
		tot = space;
	else
		tot = resid;

	/*
	 * The smallest unit is a single mbuf with pkthdr.
	 * We can't align past it.
	 */
	if (align >= MHLEN)
		return (EINVAL);

	/*
	 * Give us the full allocation or nothing.
	 * If space is zero return the smallest empty mbuf.
	 */
	if ((len = tot + align) == 0)
		len = 1;

	/* Loop and append maximum sized mbufs to the chain tail. */
	while (len > 0) {
		uint32_t m_needed = 1;

		if (njcl > 0 && len > MBIGCLBYTES)
			mb = m_getpackets_internal(&m_needed, 1,
			    how, 1, M16KCLBYTES);
		else if (len > MCLBYTES)
			mb = m_getpackets_internal(&m_needed, 1,
			    how, 1, MBIGCLBYTES);
		else if (len >= (signed)MINCLSIZE)
			mb = m_getpackets_internal(&m_needed, 1,
			    how, 1, MCLBYTES);
		else
			mb = m_gethdr(how, MT_DATA);

		/* Fail the whole operation if one mbuf can't be allocated. */
		if (mb == NULL) {
			if (nm != NULL)
				m_freem(nm);
			return (ENOBUFS);
		}

		/* Book keeping. */
		VERIFY(mb->m_flags & M_PKTHDR);
		len -= ((mb->m_flags & M_EXT) ? mb->m_ext.ext_size : MHLEN);
		if (mtail != NULL)
			mtail->m_next = mb;
		else
			nm = mb;
		mtail = mb;
	}

	/* len started out >= 1, so at least one mbuf was allocated */
	m = nm;
	m->m_data += align;

	progress = 0;
	/* Fill all mbufs with uio data and update header information. */
	for (mb = m; mb != NULL; mb = mb->m_next) {
		user_ssize_t room = M_TRAILINGSPACE(mb);

		/* copy whatever is left, capped by the room in this mbuf */
		len = tot - progress;
		if (len > room)
			len = room;

		/* len is bounded by room here, so it fits in an int */
		error = uiomove(mtod(mb, char *), (int)len, uio);
		if (error != 0) {
			m_freem(m);
			return (error);
		}

		/* each mbuf is M_PKTHDR chained via m_next */
		mb->m_len = len;
		mb->m_pkthdr.len = len;

		progress += len;
	}
	VERIFY(progress == tot);
	*top = m;
	return (0);
}

/*
 * MPTCP socket protocol-user socket send routine, derived from sosend().
 *
 * mp_so   - the MP socket; must be SOCK_STREAM and not a subflow socket
 * addr    - unused
 * uio     - user data to send; required
 * top     - must be NULL; a caller-supplied mbuf chain is not accepted
 * control - must be NULL; any chain passed in is freed before returning
 * flags   - MSG_* flags; MSG_OOB, MSG_DONTROUTE, MSG_HOLD, MSG_SEND and
 *	     MSG_FLUSH yield EOPNOTSUPP, MSG_EOR yields EINVAL
 *
 * Returns 0 on success, EINVAL/EOPNOTSUPP for rejected arguments, or the
 * error reported by sosendcheck(), mptcp_uiotombuf(), the socket filters
 * or the protocol's pru_send routine.
 *
 * The socket lock is taken here, dropped around each copy from userland,
 * and released again at the "release" label.  The "out" label is reached
 * with the socket already unlocked (or never locked).
 */
static int
mptcp_usr_sosend(struct socket *mp_so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
#pragma unused(addr)
	int32_t space;
	user_ssize_t resid;
	int error, sendflags;
	struct proc *p = current_proc();
	int sblocked = 0;

	/* UIO is required for now, due to per-mbuf M_PKTHDR constraints */
	if (uio == NULL || top != NULL) {
		error = EINVAL;
		goto out;
	}
	resid = uio_resid(uio);

	socket_lock(mp_so, 1);
	so_update_last_owner_locked(mp_so, p);
	so_update_policy(mp_so);

	VERIFY(mp_so->so_type == SOCK_STREAM);
	VERIFY(!(mp_so->so_flags & SOF_MP_SUBFLOW));

	/* these flags, and message-mode sockets, are not supported here */
	if ((flags & (MSG_OOB|MSG_DONTROUTE|MSG_HOLD|MSG_SEND|MSG_FLUSH)) ||
	    (mp_so->so_flags & SOF_ENABLE_MSGS)) {
		error = EOPNOTSUPP;
		socket_unlock(mp_so, 1);
		goto out;
	}

	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 */
	if (resid < 0 || (flags & MSG_EOR) || control != NULL) {
		error = EINVAL;
		socket_unlock(mp_so, 1);
		goto out;
	}

	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);

	/*
	 * Outer loop: one sosendcheck() pass per refill of the send buffer.
	 * Inner loop: copy and send for as long as there is both data left
	 * and space remaining from that pass.
	 */
	do {
		error = sosendcheck(mp_so, NULL, resid, 0, 0, flags,
		    &sblocked, NULL);
		if (error != 0)
			goto release;

		space = sbspace(&mp_so->so_snd);
		do {
			/* the copy from userland runs without the socket lock */
			socket_unlock(mp_so, 0);
			/*
			 * Copy the data from userland into an mbuf chain.
			 */
			error = mptcp_uiotombuf(uio, M_WAITOK, space, 0, &top);
			if (error != 0) {
				/* "release" expects the socket lock to be held */
				socket_lock(mp_so, 0);
				goto release;
			}
			VERIFY(top != NULL);
			/* charge the bytes just consumed from uio against space */
			space -= resid - uio_resid(uio);
			resid = uio_resid(uio);
			socket_lock(mp_so, 0);

			/*
			 * Compute flags here, for pru_send and NKEs.
			 */
			sendflags = (resid > 0 && space > 0) ?
			    PRUS_MORETOCOME : 0;

			/*
			 * Socket filter processing
			 */
			VERIFY(control == NULL);
			error = sflt_data_out(mp_so, NULL, &top, &control, 0);
			if (error != 0) {
				if (error == EJUSTRETURN) {
					/*
					 * Reported to the caller as success;
					 * top is forgotten here, not freed.
					 */
					error = 0;
					top = NULL;
					/* always free control if any */
				}
				goto release;
			}
			/* drop any control chain the filter call left behind */
			if (control != NULL) {
				m_freem(control);
				control = NULL;
			}

			/*
			 * Pass data to protocol.
			 */
			error = (*mp_so->so_proto->pr_usrreqs->pru_send)
			    (mp_so, sendflags, top, NULL, NULL, p);

			/* top is not touched again after pru_send, even on error */
			top = NULL;
			if (error != 0)
				goto release;
		} while (resid != 0 && space > 0);
	} while (resid != 0);

release:
	if (sblocked)
		sbunlock(&mp_so->so_snd, FALSE); /* will unlock socket */
	else
		socket_unlock(mp_so, 1);
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);

	/*
	 * NOTE(review): so_flags1 is modified below after the socket lock
	 * has been dropped (and, on the first EINVAL path, without it ever
	 * having been taken) -- confirm this is intended.
	 */
	/* clear SOF1_PRECONNECT_DATA after one write */
	if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)
		mp_so->so_flags1 &= ~SOF1_PRECONNECT_DATA;

	return (error);
}

/*
 * Filter SOPT_{SET,GET} requests for SOL_SOCKET level options issued on
 * the MP socket.  The return value tells the caller (sosetoptlock() and
 * sogetoptlock()) whether to carry on with the option:
 *
 *   0            - the option is to be processed
 *   EINVAL       - the option is one that is also issued on subflows, but
 *		    its value is not integer-sized
 *   ENOPROTOOPT  - the option is not allowed on an MP socket
 */
static int
mptcp_usr_socheckopt(struct socket *mp_so, struct sockopt *sopt)
{
#pragma unused(mp_so)
	VERIFY(sopt->sopt_level == SOL_SOCKET);

	/*
	 * sopt_dir (set/get) is deliberately not examined; that is left
	 * to the caller, so the lists below are a superset of what is
	 * allowed for either direction.
	 *
	 * XXX: adi@apple.com
	 *
	 * Need to consider the following cases:
	 *
	 *   a. In the event peeloff(2) occurs on the subflow socket,
	 *	we may want to issue those options which are now
	 *	handled at the MP socket.  In that case, we will need
	 *	to record them in mptcp_setopt() so that they can
	 *	be replayed during peeloff.
	 *
	 *   b.	Certain socket options don't have a clear definition
	 *	on the expected behavior post connect(2).  At the time
	 *	those options are issued on the MP socket, there may
	 *	be existing subflow sockets that are already connected.
	 */
	switch (sopt->sopt_name) {
	/* handled at the MP socket only */
	case SO_LINGER:
	case SO_LINGER_SEC:
	case SO_TYPE:
	case SO_NREAD:
	case SO_NWRITE:
	case SO_ERROR:
	case SO_SNDBUF:
	case SO_RCVBUF:
	case SO_SNDLOWAT:
	case SO_RCVLOWAT:
	case SO_SNDTIMEO:
	case SO_RCVTIMEO:
	case SO_NKE:
	case SO_NOSIGPIPE:
	case SO_NOADDRERR:
	case SO_LABEL:
	case SO_PEERLABEL:
	case SO_DEFUNCTOK:
	case SO_ISDEFUNCT:
	case SO_TRAFFIC_CLASS_DBG:
		return (0);

	/*
	 * Handled at the MP socket and on the subflows; these are also
	 * recorded later by mptcp_setopt().
	 */
	case SO_DEBUG:
	case SO_KEEPALIVE:
	case SO_USELOOPBACK:
	case SO_RANDOMPORT:
	case SO_TRAFFIC_CLASS:
	case SO_RECV_TRAFFIC_CLASS:
	case SO_PRIVILEGED_TRAFFIC_CLASS:
	case SO_RECV_ANYIF:
	case SO_RESTRICTIONS:
	case SO_FLUSH:
	case SO_MPTCP_FASTJOIN:
	case SO_NOWAKEFROMSLEEP:
	case SO_NOAPNFALLBK:
		/* NOTE: Only support integer option value for now. */
		return ((sopt->sopt_valsize != sizeof (int)) ? EINVAL : 0);

	default:
		break;
	}

	/* anything else: the caller must stop and report the error */
	return (ENOPROTOOPT);
}

/*
 * Issue SOPT_SET for all MPTCP subflows (for integer option values.)
 *
 * mpte - the MPTCP session; its lock must be held by the caller
 * mpo  - the option (level, name, integer value) to apply
 *
 * Returns ENOPROTOOPT if the option is not marked MPOF_SUBFLOW_OK or is
 * SO_NOSIGPIPE/SO_NOADDRERR at SOL_SOCKET level; 0 if there is no subflow
 * yet (mpo is then flagged MPOF_INTERIM); otherwise the result of the
 * last mptcp_subflow_sosetopt() attempted in the apply pass.
 *
 * The work is done in three passes over the subflow list: remember the
 * current value on each subflow, apply the new value until the first
 * failure, then clear the per-subflow bookkeeping and, on failure, put
 * the remembered values back.
 */
static int
mptcp_setopt_apply(struct mptses *mpte, struct mptopt *mpo)
{
	struct socket *mp_so;
	struct mptsub *mpts;
	struct mptopt smpo;
	int error = 0;

	/* just bail now if this isn't applicable to subflow sockets */
	if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK)) {
		error = ENOPROTOOPT;
		goto out;
	}

	/*
	 * Skip those that are handled internally; these options
	 * should not have been recorded and marked with the
	 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
	 */
	if (mpo->mpo_level == SOL_SOCKET &&
	    (mpo->mpo_name == SO_NOSIGPIPE || mpo->mpo_name == SO_NOADDRERR)) {
		error = ENOPROTOOPT;
		goto out;
	}

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	/* NOTE(review): mp_so is assigned here but not used further below */
	mp_so = mpte->mpte_mppcb->mpp_socket;

	/*
	 * Don't bother going further if there's no subflow; mark the option
	 * with MPOF_INTERIM so that we know whether or not to remove this
	 * option upon encountering an error while issuing it during subflow
	 * socket creation.
	 */
	if (mpte->mpte_numflows == 0) {
		VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows));
		mpo->mpo_flags |= MPOF_INTERIM;
		/* return success */
		goto out;
	}

	/* scratch option used to read, and later restore, the old values */
	bzero(&smpo, sizeof (smpo));
	smpo.mpo_flags |= MPOF_SUBFLOW_OK;
	smpo.mpo_level = mpo->mpo_level;
	smpo.mpo_name = mpo->mpo_name;

	/* grab existing values in case we need to rollback */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		struct socket *so;

		MPTS_LOCK(mpts);
		/* start from a clean slate for this subflow */
		mpts->mpts_flags &= ~(MPTSF_SOPT_OLDVAL|MPTSF_SOPT_INPROG);
		mpts->mpts_oldintval = 0;
		smpo.mpo_intval = 0;
		VERIFY(mpts->mpts_socket != NULL);
		so = mpts->mpts_socket;
		socket_lock(so, 0);
		/* MPTSF_SOPT_OLDVAL marks that mpts_oldintval holds a value */
		if (mptcp_subflow_sogetopt(mpte, so, &smpo) == 0) {
			mpts->mpts_flags |= MPTSF_SOPT_OLDVAL;
			mpts->mpts_oldintval = smpo.mpo_intval;
		}
		socket_unlock(so, 0);
		MPTS_UNLOCK(mpts);
	}

	/* apply socket option */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		struct socket *so;

		MPTS_LOCK(mpts);
		/* MPTSF_SOPT_INPROG marks subflows on which a set was attempted */
		mpts->mpts_flags |= MPTSF_SOPT_INPROG;
		VERIFY(mpts->mpts_socket != NULL);
		so = mpts->mpts_socket;
		socket_lock(so, 0);
		error = mptcp_subflow_sosetopt(mpte, so, mpo);
		socket_unlock(so, 0);
		MPTS_UNLOCK(mpts);
		/* stop at the first failure; later subflows stay untouched */
		if (error != 0)
			break;
	}

	/* cleanup, and rollback if needed */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		struct socket *so;

		MPTS_LOCK(mpts);
		/* never reached by the apply pass: nothing to undo */
		if (!(mpts->mpts_flags & MPTSF_SOPT_INPROG)) {
			/* clear in case it's set */
			mpts->mpts_flags &= ~MPTSF_SOPT_OLDVAL;
			mpts->mpts_oldintval = 0;
			MPTS_UNLOCK(mpts);
			continue;
		}
		/* attempted, but no old value was captured to restore */
		if (!(mpts->mpts_flags & MPTSF_SOPT_OLDVAL)) {
			mpts->mpts_flags &= ~MPTSF_SOPT_INPROG;
			VERIFY(mpts->mpts_oldintval == 0);
			MPTS_UNLOCK(mpts);
			continue;
		}
		/* error during sosetopt, so roll it back */
		if (error != 0) {
			VERIFY(mpts->mpts_socket != NULL);
			so = mpts->mpts_socket;
			socket_lock(so, 0);
			smpo.mpo_intval = mpts->mpts_oldintval;
			/* best effort: a failure to restore is ignored */
			(void) mptcp_subflow_sosetopt(mpte, so, &smpo);
			socket_unlock(so, 0);
		}
		mpts->mpts_oldintval = 0;
		mpts->mpts_flags &= ~(MPTSF_SOPT_OLDVAL|MPTSF_SOPT_INPROG);
		MPTS_UNLOCK(mpts);
	}

out:
	return (error);
}

/*
 * Handle SOPT_SET for socket options issued on MP socket.
 */
static int
mptcp_setopt(struct mptses *mpte, struct sockopt *sopt)
{
	int error = 0, optval, level, optname, rec = 1;
	struct mptopt smpo, *mpo = NULL;
	struct socket *mp_so;
	char buf[32];

	level = sopt->sopt_level;
	optname = sopt->sopt_name;

	VERIFY(sopt->sopt_dir == SOPT_SET);
	VERIFY(level == SOL_SOCKET || level == IPPROTO_TCP);
	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mp_so = mpte->mpte_mppcb->mpp_socket;

	/*
	 * Record socket options which are applicable to subflow sockets so
	 * that we can replay them for new ones; see mptcp_usr_socheckopt()
	 * for the list of eligible socket-level options.
	 */
	if (level == SOL_SOCKET) {
		switch (optname) {
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_USELOOPBACK:
		case SO_RANDOMPORT:
		case SO_TRAFFIC_CLASS:
		case SO_RECV_TRAFFIC_CLASS:
		case SO_PRIVILEGED_TRAFFIC_CLASS:
		case SO_RECV_ANYIF:
		case SO_RESTRICTIONS:
		case SO_NOWAKEFROMSLEEP:
		case SO_MPTCP_FASTJOIN:
		case SO_NOAPNFALLBK:
			/* record it */
			break;
		case SO_FLUSH:
			/* don't record it */
			rec = 0;
			break;
		default:
			/* nothing to do; just return success */
			goto out;
		}
	} else {
		switch (optname) {
		case TCP_NODELAY:
		case TCP_RXT_FINDROP:
		case TCP_KEEPALIVE:
		case TCP_KEEPINTVL:
		case TCP_KEEPCNT:
		case TCP_CONNECTIONTIMEOUT:
		case TCP_RXT_CONNDROPTIME:
		case PERSIST_TIMEOUT:
			/* eligible; record it */
			break;
		case TCP_NOTSENT_LOWAT:
			/* record at MPTCP level */
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error)
				goto out;
			if (optval < 0) {
				error = EINVAL;
				goto out;
			} else {
				if (optval == 0) {
					mp_so->so_flags &= ~SOF_NOTSENT_LOWAT;
					error = mptcp_set_notsent_lowat(mpte,0);
				} else {
					mp_so->so_flags |= SOF_NOTSENT_LOWAT;
					error = mptcp_set_notsent_lowat(mpte,
					    optval);
				}
			}
			goto out;
		default:
			/* not eligible */
			error = ENOPROTOOPT;
			goto out;
		}
	}

	if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
	    sizeof (optval))) != 0)
		goto out;

	if (rec) {
		/* search for an existing one; if not found, allocate */
		if ((mpo = mptcp_sopt_find(mpte, sopt)) == NULL)
			mpo = mptcp_sopt_alloc(M_WAITOK);

		if (mpo == NULL) {
			error = ENOBUFS;
		} else {
			mptcplog((LOG_DEBUG, "MPTCP Socket: "
			    "%s: mp_so 0x%llx sopt %s "
			    "val %d %s\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mptcp_sopt2str(level, optname, buf,
			    sizeof (buf)), optval,
			    (mpo->mpo_flags & MPOF_ATTACHED) ?
			    "updated" : "recorded"),
			    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);

			/* initialize or update, as needed */
			mpo->mpo_intval = optval;
			if (!(mpo->mpo_flags & MPOF_ATTACHED)) {
				mpo->mpo_level = level;
				mpo->mpo_name = optname;
				mptcp_sopt_insert(mpte, mpo);
			}
			VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
			/* this can be issued on the subflow socket */
			mpo->mpo_flags |= MPOF_SUBFLOW_OK;
		}
	} else {
		bzero(&smpo, sizeof (smpo));
		mpo = &smpo;
		mpo->mpo_flags |= MPOF_SUBFLOW_OK;
		mpo->mpo_level = level;
		mpo->mpo_name = optname;
		mpo->mpo_intval = optval;
	}
	VERIFY(mpo == NULL || error == 0);

	/* issue this socket option on existing subflows */
	if (error == 0) {
		error = mptcp_setopt_apply(mpte, mpo);
		if (error != 0 && (mpo->mpo_flags & MPOF_ATTACHED)) {
			VERIFY(mpo != &smpo);
			mptcp_sopt_remove(mpte, mpo);
			mptcp_sopt_free(mpo);
		}
		if (mpo == &smpo)
			mpo->mpo_flags &= ~MPOF_INTERIM;
	}
out:
	if (error == 0 && mpo != NULL) {
		mptcplog((LOG_ERR, "MPTCP Socket: "
		    "%s: mp_so 0x%llx sopt %s val %d set %s\n",
		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mptcp_sopt2str(level, optname, buf,
		    sizeof (buf)), optval, (mpo->mpo_flags & MPOF_INTERIM) ?
		    "pending" : "successful"),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
	} else if (error != 0) {
		mptcplog((LOG_ERR, "MPTCP Socket: "
		    "%s: mp_so 0x%llx sopt %s can't be issued "
		    "error %d\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mptcp_sopt2str(level,
		    optname, buf, sizeof (buf)), error),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
	}
	return (error);
}

/*
 * Handle SOPT_GET for socket options issued on MP socket.
 *
 * Only IPPROTO_TCP level options are handled here; socket level options
 * are already taken care of at the socket layer.
 *
 * Returns ENOPROTOOPT for any other level or for a TCP option that is
 * not eligible, the error from mptcp_default_tcp_optval(), or the result
 * of copying the value out with sooptcopyout().
 *
 * TCP_NOTSENT_LOWAT used to jump past sooptcopyout(), so the request
 * succeeded without ever handing a value back to the caller; it now goes
 * through the common copy-out like every other option.
 */
static int
mptcp_getopt(struct mptses *mpte, struct sockopt *sopt)
{
	struct mptopt *mpo;
	int error, optval = 0;

	VERIFY(sopt->sopt_dir == SOPT_GET);
	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */

	if (sopt->sopt_level != IPPROTO_TCP)
		return (ENOPROTOOPT);

	switch (sopt->sopt_name) {
	case TCP_NOTSENT_LOWAT:
		/*
		 * Kept at the MPTCP level by mptcp_setopt() and never
		 * recorded in the option list, so there is nothing to
		 * look up; report 0 while the feature is switched off.
		 */
		if (mpte->mpte_mppcb->mpp_socket->so_flags & SOF_NOTSENT_LOWAT)
			optval = mptcp_get_notsent_lowat(mpte);
		else
			optval = 0;
		break;

	case TCP_NODELAY:
	case TCP_RXT_FINDROP:
	case TCP_KEEPALIVE:
	case TCP_KEEPINTVL:
	case TCP_KEEPCNT:
	case TCP_CONNECTIONTIMEOUT:
	case TCP_RXT_CONNDROPTIME:
	case PERSIST_TIMEOUT:
		/* eligible; start from the default value */
		error = mptcp_default_tcp_optval(mpte, sopt, &optval);
		if (error != 0)
			return (error);

		/*
		 * Prefer the value recorded at setsockopt(2) time, if any.
		 * This assumes that the value did not get modified by the
		 * lower layer after it was issued.
		 */
		if ((mpo = mptcp_sopt_find(mpte, sopt)) != NULL)
			optval = mpo->mpo_intval;
		break;

	default:
		/* not eligible */
		return (ENOPROTOOPT);
	}

	return (sooptcopyout(sopt, &optval, sizeof (int)));
}

/*
 * Return default values for TCP socket options.  Ideally we would query the
 * subflow TCP socket, but that requires creating a subflow socket before
 * connectx(2) time.  To simplify things, just return the default values
 * that we know of.
 *
 * On success the default is stored in *optval and 0 is returned; an
 * option that is not known here yields ENOPROTOOPT and leaves *optval
 * untouched.
 */
static int
mptcp_default_tcp_optval(struct mptses *mpte, struct sockopt *sopt, int *optval)
{
	VERIFY(sopt->sopt_level == IPPROTO_TCP);
	VERIFY(sopt->sopt_dir == SOPT_GET);
	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */

	/* try to do what tcp_newtcpcb() does */
	switch (sopt->sopt_name) {
	case TCP_KEEPALIVE:
		*optval = mptcp_subflow_keeptime;
		return (0);

	case PERSIST_TIMEOUT:
		*optval = tcp_max_persist_timeout;
		return (0);

	case TCP_NODELAY:
	case TCP_RXT_FINDROP:
	case TCP_KEEPINTVL:
	case TCP_KEEPCNT:
	case TCP_CONNECTIONTIMEOUT:
	case TCP_RXT_CONNDROPTIME:
	case TCP_NOTSENT_LOWAT:
		*optval = 0;
		return (0);

	default:
		break;
	}

	return (ENOPROTOOPT);
}

/*
 * MPTCP SOPT_{SET,GET} socket option handler, for options issued on the MP
 * socket, at SOL_SOCKET and IPPROTO_TCP levels.  The former is restricted
 * to those that are allowed by mptcp_usr_socheckopt().
 *
 * Returns EINVAL when the PCB is missing or dead, or when the option
 * level is neither of the two above; otherwise whatever mptcp_setopt()
 * or mptcp_getopt() returns.  Any other sopt_dir value yields 0.
 */
int
mptcp_ctloutput(struct socket *mp_so, struct sockopt *sopt)
{
	struct mppcb *mpp = sotomppcb(mp_so);
	struct mptses *mpte;

	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD)
		return (EINVAL);

	mpte = mptompte(mpp);
	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */

	/* we only handle socket and TCP-level socket options for MPTCP */
	if (!(sopt->sopt_level == SOL_SOCKET ||
	    sopt->sopt_level == IPPROTO_TCP)) {
		char buf[32];
		mptcplog((LOG_DEBUG, "MPTCP Socket: "
		    "%s: mp_so 0x%llx sopt %s level not "
		    "handled\n", __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mptcp_sopt2str(sopt->sopt_level,
		    sopt->sopt_name, buf, sizeof (buf))),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
		return (EINVAL);
	}

	/* dispatch on direction */
	if (sopt->sopt_dir == SOPT_SET)
		return (mptcp_setopt(mpte, sopt));
	if (sopt->sopt_dir == SOPT_GET)
		return (mptcp_getopt(mpte, sopt));

	return (0);
}

/*
 * Return a string representation of <sopt_level,sopt_name>
 */
const char *
mptcp_sopt2str(int level, int optname, char *dst, int size)
{
	char lbuf[32], obuf[32];
	const char *l = lbuf, *o = obuf;

	(void) snprintf(lbuf, sizeof (lbuf), "0x%x", level);
	(void) snprintf(obuf, sizeof (obuf), "0x%x", optname);

	switch (level) {
	case SOL_SOCKET:
		l = "SOL_SOCKET";
		switch (optname) {
		case SO_LINGER:
			o = "SO_LINGER";
			break;
		case SO_LINGER_SEC:
			o = "SO_LINGER_SEC";
			break;
		case SO_DEBUG:
			o = "SO_DEBUG";
			break;
		case SO_KEEPALIVE:
			o = "SO_KEEPALIVE";
			break;
		case SO_USELOOPBACK:
			o = "SO_USELOOPBACK";
			break;
		case SO_TYPE:
			o = "SO_TYPE";
			break;
		case SO_NREAD:
			o = "SO_NREAD";
			break;
		case SO_NWRITE:
			o = "SO_NWRITE";
			break;
		case SO_ERROR:
			o = "SO_ERROR";
			break;
		case SO_SNDBUF:
			o = "SO_SNDBUF";
			break;
		case SO_RCVBUF:
			o = "SO_RCVBUF";
			break;
		case SO_SNDLOWAT:
			o = "SO_SNDLOWAT";
			break;
		case SO_RCVLOWAT:
			o = "SO_RCVLOWAT";
			break;
		case SO_SNDTIMEO:
			o = "SO_SNDTIMEO";
			break;
		case SO_RCVTIMEO:
			o = "SO_RCVTIMEO";
			break;
		case SO_NKE:
			o = "SO_NKE";
			break;
		case SO_NOSIGPIPE:
			o = "SO_NOSIGPIPE";
			break;
		case SO_NOADDRERR:
			o = "SO_NOADDRERR";
			break;
		case SO_RESTRICTIONS:
			o = "SO_RESTRICTIONS";
			break;
		case SO_LABEL:
			o = "SO_LABEL";
			break;
		case SO_PEERLABEL:
			o = "SO_PEERLABEL";
			break;
		case SO_RANDOMPORT:
			o = "SO_RANDOMPORT";
			break;
		case SO_TRAFFIC_CLASS:
			o = "SO_TRAFFIC_CLASS";
			break;
		case SO_RECV_TRAFFIC_CLASS:
			o = "SO_RECV_TRAFFIC_CLASS";
			break;
		case SO_TRAFFIC_CLASS_DBG:
			o = "SO_TRAFFIC_CLASS_DBG";
			break;
		case SO_PRIVILEGED_TRAFFIC_CLASS:
			o = "SO_PRIVILEGED_TRAFFIC_CLASS";
			break;
		case SO_DEFUNCTOK:
			o = "SO_DEFUNCTOK";
			break;
		case SO_ISDEFUNCT:
			o = "SO_ISDEFUNCT";
			break;
		case SO_OPPORTUNISTIC:
			o = "SO_OPPORTUNISTIC";
			break;
		case SO_FLUSH:
			o = "SO_FLUSH";
			break;
		case SO_RECV_ANYIF:
			o = "SO_RECV_ANYIF";
			break;
		case SO_NOWAKEFROMSLEEP:
			o = "SO_NOWAKEFROMSLEEP";
			break;
		case SO_MPTCP_FASTJOIN:
			o = "SO_MPTCP_FASTJOIN";
			break;
		case SO_NOAPNFALLBK:
			o = "SO_NOAPNFALLBK";
			break;
		}
		break;
	case IPPROTO_TCP:
		l = "IPPROTO_TCP";
		switch (optname) {
		case TCP_KEEPALIVE:
			o = "TCP_KEEPALIVE";
			break;
		case TCP_KEEPINTVL:
			o = "TCP_KEEPINTVL";
			break;
		case TCP_KEEPCNT:
			o = "TCP_KEEPCNT";
			break;
		case TCP_CONNECTIONTIMEOUT:
			o = "TCP_CONNECTIONTIMEOUT";
			break;
		case TCP_RXT_CONNDROPTIME:
			o = "TCP_RXT_CONNDROPTIME";
			break;
		case PERSIST_TIMEOUT:
			o = "PERSIST_TIMEOUT";
			break;
		}
		break;
	}

	(void) snprintf(dst, size, "<%s,%s>", l, o);
	return (dst);
}

static int
mptcp_usr_preconnect(struct socket *mp_so)
{
	struct mptsub *mpts = NULL;
	struct mppcb *mpp = sotomppcb(mp_so);
	struct mptses *mpte;
	struct socket *so;
	struct tcpcb *tp = NULL;

	mpte = mptompte(mpp);
	VERIFY(mpte != NULL);
	MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */

	mpts = mptcp_get_subflow(mpte, NULL, NULL);
	if (mpts == NULL) {
		mptcplog((LOG_ERR, "MPTCP Socket: "
		    "%s: mp_so 0x%llx invalid preconnect ", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
		return (EINVAL);
	}
	MPTS_LOCK(mpts);
	mpts->mpts_flags &= ~MPTSF_TFO_REQD;
	so = mpts->mpts_socket;
	socket_lock(so, 0);
	tp = intotcpcb(sotoinpcb(so));
	tp->t_mpflags &= ~TMPF_TFO_REQUEST;
	int error = tcp_output(sototcpcb(so));
	socket_unlock(so, 0);
	MPTS_UNLOCK(mpts);
	mp_so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
	return (error);
}